diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc
index 08a16f6f2c..c160c5e06b 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc
@@ -10,6 +10,7 @@
 #include <cassert>

 #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */
+// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
 #include "curand.h"
 #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); }
 inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true )
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/HiprandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/HiprandRandomNumberKernel.cc
new file mode 100644
index 0000000000..2e4534f9d4
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/HiprandRandomNumberKernel.cc
@@ -0,0 +1,145 @@
+// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin.
+
+#include "mgOnGpuConfig.h"
+
+#include "GpuRuntime.h"
+#include "MemoryBuffers.h"
+#include "RandomNumberKernels.h"
+
+#include <cassert>
+
+#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */
+#ifndef __HIP_PLATFORM_AMD__
+#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand)
+#endif
+#include <hiprand/hiprand.h>
+#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); }
+inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != HIPRAND_STATUS_SUCCESS )
+  {
+    printf( "HiprandAssert: %s:%d code=%d\n", file, line, code );
+    if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS );
+  }
+}
+#endif /* clang-format on */
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice )
+    : RandomNumberKernelBase( rnarray )
+    , m_isOnDevice( onDevice )
+  {
+    if( m_isOnDevice )
+    {
+#ifdef MGONGPUCPP_GPUIMPL
+      if( !m_rnarray.isOnDevice() )
+        throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" );
+#else
+      throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" );
+#endif
+    }
+    else
+    {
+      if( m_rnarray.isOnDevice() )
+        throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" );
+    }
+    createGenerator();
+  }
+
+  //--------------------------------------------------------------------------
+
+  HiprandRandomNumberKernel::~HiprandRandomNumberKernel()
+  {
+    destroyGenerator();
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed )
+  {
+    if( m_isOnDevice )
+    {
+      destroyGenerator(); // workaround for #429
+ createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? 
+ const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has 
hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! 
CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
       const bool onDevice = true;
       prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-      throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+      throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
     }
+    else if( rndgen == RandomNumberMode::HiprandHost )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+      const bool onDevice = false;
+      prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+    }
+    else if( rndgen == RandomNumberMode::HiprandDevice )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+      const bool onDevice = true;
+      prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+      throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+    }
+    else
+      throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)

     // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
     std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
   // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
   double genrtime = 0;

-  // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+  // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
   // [NB This should not be necessary using the host API: "Generation functions
   // can be called multiple times on the same generator to generate successive
   // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
   //std::cout << "Got random numbers" << std::endl;

 #ifdef MGONGPUCPP_GPUIMPL
-  if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+  if( rndgen != RandomNumberMode::CurandDevice &&
+      rndgen != RandomNumberMode::HiprandDevice &&
+      rmbsmp == RamboSamplingMode::RamboDevice )
   {
     // --- 1c. Copy rndmom from host to device
     const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
     wrkflwtxt += "FLT+";
 #else
     wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? @@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 70b90afcd3..efe82df88d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
-  endif
   CUOPTFLAGS = -lineinfo
   ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
   GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
@@ -241,7 +236,7 @@ else
   override GPUCC=
   override USE_NVTX=
   override CUINC=
-  override CURANDLIBFLAGS=
+  override HIPINC=
 endif
@@ -291,7 +286,7 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD

 # Set the default OMPFLAGS choice
 ifneq ($(findstring hipcc,$(GPUCC)),)
@@ -352,29 +347,62 @@ ifeq ($(HRDCOD),)
   override HRDCOD = 0
 endif

-# Set the default RNDGEN (random number generator) choice
-ifeq ($(RNDGEN),)
-  ifeq ($(GPUCC),)
-    override RNDGEN = hasNoCurand
-  # Edgecase for HIP compilation
-  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-    override RNDGEN = hasNoCurand
-  else ifeq ($(RNDGEN),)
-    override RNDGEN = hasCurand
-  endif
-endif
-
-# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
+# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
 export AVX
 export FPTYPE
 export HELINL
 export HRDCOD
-export RNDGEN
 export OMPFLAGS

 #-------------------------------------------------------------------------------

-#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND
+
+# If the legacy RNDGEN exists, it takes precedence over any HASCURAND choice (but a warning is printed out)
+###$(info RNDGEN=$(RNDGEN))
+ifneq ($(RNDGEN),)
+  $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!)
+  ifeq ($(RNDGEN),hasCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifeq ($(RNDGEN),hasNoCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifneq ($(RNDGEN),hasNoCurand)
+    $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+  endif
+endif
+
+# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND
+# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
+ifeq ($(HASCURAND),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASCURAND = hasNoCurand
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override HASCURAND = hasCurand
+  else # non-Nvidia GPU build
+    override HASCURAND = hasNoCurand
+  endif
+endif
+
+# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND
+# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...)
+ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index 49ccf0c4e3..f1e79433b1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, 
HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 05013cf981..506751fe1e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) 
+#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 690b6c20f4..de4d28ad16 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1338,7 +1338,6 @@ def generate_process_files(self): self.edit_testxxx() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memorybuffers() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) self.edit_memoryaccesscouplings() # AV new file (NB this is generic in Subprocesses and then linked in Sigma-specific) - # Add symbolic links in the P1 directory # NB: symlink of cudacpp.mk to makefile is overwritten by madevent makefile if this exists (#480) # NB: this relies on the assumption that cudacpp code is generated before madevent code files.ln(pjoin(self.path, 'cudacpp.mk'), self.path, 'makefile') diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index c89295c01f..f2ed8897b3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -101,8 +101,8 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'gpu/CrossSectionKernels.cc', s+'gpu/CrossSectionKernels.h', s+'gpu/MatrixElementKernels.cc', s+'gpu/MatrixElementKernels.h', s+'gpu/RamboSamplingKernels.cc', s+'gpu/RamboSamplingKernels.h', - s+'gpu/RandomNumberKernels.h', - s+'gpu/CommonRandomNumberKernel.cc', s+'gpu/CurandRandomNumberKernel.cc', + s+'gpu/RandomNumberKernels.h', s+'gpu/CommonRandomNumberKernel.cc', + s+'gpu/CurandRandomNumberKernel.cc', s+'gpu/HiprandRandomNumberKernel.cc', s+'gpu/Bridge.h', s+'gpu/BridgeKernels.cc', s+'gpu/BridgeKernels.h', s+'gpu/fbridge.cc', 
s+'gpu/fbridge.inc', s+'gpu/fsampler.cc', s+'gpu/fsampler.inc', s+'gpu/MadgraphTest.h', s+'gpu/runTest.cc', @@ -122,7 +122,8 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', - 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', + 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', + 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 9be00fc071..122537896c 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005546092987060547  +DEBUG: model prefixing takes 0.00548553466796875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,10 +163,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  @@ -176,7 +176,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,13 +193,13 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.101 s +Wrote files for 8 helas calls in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.205 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 3 routines in 0.198 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines @@ -243,16 +243,16 @@ patching file matrix1.f Hunk #3 succeeded at 230 (offset 9 lines). Hunk #4 succeeded at 267 (offset 18 lines). Hunk #5 succeeded at 312 (offset 18 lines). -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 238]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m2.033s -user 0m1.688s -sys 0m0.223s +real 0m1.925s +user 0m1.621s +sys 0m0.231s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, 
HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/HiprandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling?
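Distilled, the allocation chain above is a plain dispatch on rndgen (note the renumbered enum: device modes are positive, host modes negative, CommonRandom is zero). A minimal sketch of the same logic as a factory, assuming all three kernel classes from RandomNumberKernels.h were compiled in; makeRandomNumberKernel is an illustrative name only, and the real code guards each branch with the MGONGPU_HAS_NO_CURAND/MGONGPU_HAS_NO_HIPRAND and __CUDACC__/__HIPCC__ checks shown above:

// Sketch only: mirrors the if/else chain above without the #ifdef guards.
std::unique_ptr<RandomNumberKernelBase>
makeRandomNumberKernel( RandomNumberMode mode, BufferRndNumMomenta& hstRndmom, BufferRndNumMomenta& devRndmom )
{
  switch( mode )
  {
    case RandomNumberMode::CommonRandom: return std::make_unique<CommonRandomNumberKernel>( hstRndmom );
    case RandomNumberMode::CurandHost: return std::make_unique<CurandRandomNumberKernel>( hstRndmom, false );
    case RandomNumberMode::CurandDevice: return std::make_unique<CurandRandomNumberKernel>( devRndmom, true );
    case RandomNumberMode::HiprandHost: return std::make_unique<HiprandRandomNumberKernel>( hstRndmom, false );
    case RandomNumberMode::HiprandDevice: return std::make_unique<HiprandRandomNumberKernel>( devRndmom, true );
  }
  throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // same fallback as above
}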
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel?
+ const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_...
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) 
###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND are set in cudacpp.mk and are not needed here) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 6bde4466d0..33306bc3e2 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g.
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, use hiprand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index a0e7cdc747..0ca2931a2d 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005625724792480469  +DEBUG: model prefixing takes 0.005585908889770508  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step.
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT @@ -162,28 +162,28 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=0 [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.276 s +ALOHA: aloha creates 4 routines in 0.267 s FFV1 FFV1 FFV2 @@ -202,7 +202,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m0.681s -user 0m0.609s -sys 0m0.063s +real 0m0.705s +user 0m0.594s +sys 0m0.053s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include <cassert> #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include <cassert> + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include <hiprand/hiprand.h> +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + 
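Taken together, seedGenerator above and the createGenerator/destroyGenerator helpers below implement the usual host-API lifecycle: create a generator, seed it, generate, destroy it. A standalone sketch of that lifecycle for the double-precision case, under the same assumptions as this file (an AMD build with hiprand available, the checkHiprand macro and header from the top of the file); note that creating a fresh generator per seed also sidesteps the reseeding problem (#429) that motivates the destroy/recreate pattern above:

// Sketch only: the device-generator lifecycle that HiprandRandomNumberKernel wraps.
void fillUniformOnDevice( double* devBuf, size_t n, unsigned int seed )
{
  hiprandGenerator_t gen;
  checkHiprand( hiprandCreateGenerator( &gen, HIPRAND_RNG_PSEUDO_MTGP32 ) ); // same RNG type as createGenerator below
  checkHiprand( hiprandSetPseudoRandomGeneratorSeed( gen, seed ) );
  checkHiprand( hiprandGenerateUniformDouble( gen, devBuf, n ) ); // devBuf must be device memory
  checkHiprand( hiprandDestroyGenerator( gen ) );
}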
//-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/HiprandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ 
-1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling?
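The if/else ladders that build rndgentxt and wrkflwtxt above grow with every new generator; the same mapping could be written once as a switch over the enum. An illustrative refactoring (not part of the patch) for the workflow-tag fragment:

// Sketch only: maps each generator mode to the tag appended to wrkflwtxt above.
inline const char* rndgenTag( RandomNumberMode mode )
{
  switch( mode )
  {
    case RandomNumberMode::CommonRandom: return "COMMON+";
    case RandomNumberMode::CurandHost: return "CURHST+";
    case RandomNumberMode::CurandDevice: return "CURDEV+";
    case RandomNumberMode::HiprandHost: return "HIRHST+";
    case RandomNumberMode::HiprandDevice: return "HIRDEV+";
  }
  return "??????+"; // no path to this statement (same fallback as above)
}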
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel?
+ const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_...
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info 
GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index 475749ca7c..22d6921fda 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 87b9927ea2..465f0fdf8e 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057108402252197266  +DEBUG: model prefixing takes 0.005699872970581055  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -164,10 +164,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,16 +193,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.106 s +Wrote files for 10 helas calls in 0.099 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.148 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 2 routines in 0.146 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -232,17 +232,17 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 238]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m1.795s -user 0m1.517s -sys 0m0.229s -Code generation completed in 2 seconds +real 0m1.712s +user 0m1.461s +sys 0m0.230s +Code generation completed in 1 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + 
void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) 
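+ // (NB with this patch, RamboHost also excludes HiprandDevice: a fallback to CommonRandom is applied below, since HiprandHost is not implemented yet)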
enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? 
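+ // (this patch adds the HIPRAND HOST and HIPRAND DEVICE cases below)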
if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? @@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? + const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) 
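+  # Example of the legacy mapping (hypothetical invocation): "make RNDGEN=hasNoCurand" now behaves like "make HASCURAND=hasNoCurand", after printing the warning above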
+ ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) + endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of 
HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info 
GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 6bde4466d0..33306bc3e2 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index f34d33a342..04ee1fae0a 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005566120147705078  +DEBUG: model prefixing takes 0.005332469940185547  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,26 +163,26 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=0 [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.149 s +ALOHA: aloha creates 2 routines in 0.143 s VVV1 FFV1 FFV1 @@ -197,7 +197,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.558s -user 0m0.498s -sys 0m0.048s -Code generation completed in 1 seconds +real 0m0.640s +user 0m0.463s +sys 0m0.061s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
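+// NB mgOnGpuConfig.h is included first below, mirroring the include-order note in CurandRandomNumberKernel.cc (presumably the same __global__ definition constraint applies to the hiprand headers)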
+ +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, 
HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? 
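The default-mode cascade in the check_sa.cc hunks above is easiest to read in isolation: the outer #if picks the platform (__CUDACC__, __HIPCC__, CPU) and the inner branches degrade from the preferred generator to CommonRandom. Below is a minimal standalone sketch of the same cascade; the TEST_* macros stand in for __CUDACC__/__HIPCC__ and invert the real MGONGPU_HAS_NO_CURAND/MGONGPU_HAS_NO_HIPRAND flags, and everything except the enum values is illustrative, not part of the patch.

// Illustrative sketch of the default random-number-mode selection above.
// Build e.g. with: c++ -DTEST_CUDACC -DTEST_HAS_CURAND rndmode_demo.cc
#include <iostream>

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

constexpr RandomNumberMode defaultMode()
{
#if defined TEST_CUDACC // NVidia GPU build
#ifdef TEST_HAS_CURAND
  return RandomNumberMode::CurandDevice;
#else
  return RandomNumberMode::CommonRandom;
#endif
#elif defined TEST_HIPCC // AMD GPU build
#ifdef TEST_HAS_HIPRAND
  return RandomNumberMode::HiprandDevice;
#else
  return RandomNumberMode::CommonRandom;
#endif
#else // CPU build
#ifdef TEST_HAS_CURAND
  return RandomNumberMode::CurandHost;
#elif defined TEST_HAS_HIPRAND
  return RandomNumberMode::HiprandDevice; // NB the patch also falls back to the *device* generator here, as HiprandHost is not implemented yet
#else
  return RandomNumberMode::CommonRandom;
#endif
#endif
}

int main()
{
  std::cout << "default mode = " << static_cast<int>( defaultMode() ) << std::endl;
  return 0;
}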
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
 
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h
index 21d63beeac..7ed728a26c 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h
@@ -1,21 +1,22 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.
 
 #ifndef RANDOMNUMBERKERNELS_H
 #define RANDOMNUMBERKERNELS_H 1
 
 #include "mgOnGpuConfig.h"
 
-// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
-#ifndef MGONGPU_HAS_NO_CURAND
-//#include "curand.h"
-struct curandGenerator_st; // forward definition from curand.h
-#endif
-
 #include "MemoryBuffers.h"
 
+// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc)
+struct curandGenerator_st;
+
+// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc)
+struct rocrand_generator_base_type;
+typedef rocrand_generator_base_type hiprandGenerator_st;
+
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
@@ -107,7 +108,6 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef MGONGPU_HAS_NO_CURAND
   // A class encapsulating CURAND random number generation on a CPU host or on a GPU device
   class CurandRandomNumberKernel final : public RandomNumberKernelBase
   {
@@ -142,11 +142,49 @@ namespace mg5amcCpu
     const bool m_isOnDevice;
 
     // The curand generator
-    // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st')
+    // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st')
     curandGenerator_st* m_rnGen;
   };
-#endif
+  //--------------------------------------------------------------------------
+
+  // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device
+  class HiprandRandomNumberKernel final : public RandomNumberKernelBase
+  {
+  public:
+
+    // Constructor from an existing output buffer
+    HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice );
+
+    // Destructor
+    ~HiprandRandomNumberKernel();
+
+    // Seed the random number generator
+    void seedGenerator( const unsigned int seed ) override final;
+
+    // Generate the random number array
+    void generateRnarray() override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return m_isOnDevice; }
+
+  private:
+
+    // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void createGenerator();
+
+    // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void destroyGenerator();
+
+  private:
+
+    // Is this a host or device kernel?
+    const bool m_isOnDevice;
+
+    // The hiprand generator
+    // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st')
+    hiprandGenerator_st* m_rnGen;
+  };
 
   //--------------------------------------------------------------------------
 }
diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
index 1077bdc098..3ad91dfd59 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
@@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
   comma:=,
   CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
   CUINC = -I$(CUDA_HOME)/include/
-  ifeq ($(RNDGEN),hasNoCurand)
-    CURANDLIBFLAGS=
-  else
-    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
-  endif
   CUOPTFLAGS = -lineinfo
   ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
   GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
@@ -241,7 +236,7 @@ else
   override GPUCC=
   override USE_NVTX=
   override CUINC=
-  override CURANDLIBFLAGS=
+  override HIPINC=
 endif
@@ -291,7 +286,7 @@ endif
 
 #-------------------------------------------------------------------------------
 
-#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD
 
 # Set the default OMPFLAGS choice
 ifneq ($(findstring hipcc,$(GPUCC)),)
@@ -352,29 +347,62 @@ ifeq ($(HRDCOD),)
   override HRDCOD = 0
 endif
 
-# Set the default RNDGEN (random number generator) choice
-ifeq ($(RNDGEN),)
-  ifeq ($(GPUCC),)
-    override RNDGEN = hasNoCurand
-  # Edgecase for HIP compilation
-  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-    override RNDGEN = hasNoCurand
-  else ifeq ($(RNDGEN),)
-    override RNDGEN = hasCurand
-  endif
-endif
-
-# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
+# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
 export AVX
 export FPTYPE
 export HELINL
 export HRDCOD
-export RNDGEN
 export OMPFLAGS
 
 #-------------------------------------------------------------------------------
 
-#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND
+
+# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out)
+###$(info RNDGEN=$(RNDGEN))
+ifneq ($(RNDGEN),)
+  $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!)
+  ifeq ($(RNDGEN),hasCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifeq ($(RNDGEN),hasNoCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifneq ($(RNDGEN),hasNoCurand)
+    $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+  endif
+endif
+
+# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND
+# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
+ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info GPUCC=$(GPUCC)) 
#------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index 475749ca7c..22d6921fda 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index f321a2ba4c..86aeb0137d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005854368209838867  +DEBUG: model prefixing takes 0.005301952362060547  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -172,10 +172,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -187,7 +187,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -204,7 +204,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -219,23 +219,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
 DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871] 
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1
 INFO: Finding symmetric diagrams for subprocess group gg_ttx
-Generated helas calls for 2 subprocesses (19 diagrams) in 0.045 s
-Wrote files for 46 helas calls in 0.254 s
+Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s
+Wrote files for 46 helas calls in 0.250 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 set of routines with options: P0
 ALOHA: aloha creates VVVV3 set of routines with options: P0
 ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 5 routines in 0.336 s
-DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
+ALOHA: aloha creates 5 routines in 0.323 s
+DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203] 
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 set of routines with options: P0
 ALOHA: aloha creates VVVV3 set of routines with options: P0
 ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 10 routines in 0.319 s
+ALOHA: aloha creates 10 routines in 0.307 s
 VVV1
 VVV1
 FFV1
@@ -278,16 +278,16 @@ Hunk #2 succeeded at 159 (offset 16 lines).
 Hunk #3 succeeded at 237 (offset 16 lines).
 Hunk #4 succeeded at 265 (offset 16 lines).
 Hunk #5 succeeded at 310 (offset 16 lines).
-DEBUG: p.returncode =  0 [output.py at line 237] 
+DEBUG: p.returncode =  0 [output.py at line 238] 
 Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done.
 Type "launch" to generate events from this process, or see
 /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README
 Run "open index.html" to see more information about this process.
 quit
-real 0m2.402s
-user 0m2.110s
-sys 0m0.272s
+real 0m2.306s
+user 0m2.029s
+sys 0m0.243s
 Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc
index 08a16f6f2c..c160c5e06b 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc
@@ -10,6 +10,7 @@
 #include <cassert>
 
 #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */
+// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
 #include "curand.h"
 #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); }
 inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true )
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/HiprandRandomNumberKernel.cc
new file mode 100644
index 0000000000..2e4534f9d4
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/HiprandRandomNumberKernel.cc
@@ -0,0 +1,145 @@
+// Copyright (C) 2020-2024 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin.
+
+#include "mgOnGpuConfig.h"
+
+#include "GpuRuntime.h"
+#include "MemoryBuffers.h"
+#include "RandomNumberKernels.h"
+
+#include <cassert>
+
+#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */
+#ifndef __HIP_PLATFORM_AMD__
+#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand)
+#endif
+#include <hiprand/hiprand.h>
+#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); }
+inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != HIPRAND_STATUS_SUCCESS )
+  {
+    printf( "HiprandAssert: %s:%d code=%d\n", file, line, code );
+    if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS );
+  }
+}
+#endif /* clang-format on */
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice )
+    : RandomNumberKernelBase( rnarray )
+    , m_isOnDevice( onDevice )
+  {
+    if( m_isOnDevice )
+    {
+#ifdef MGONGPUCPP_GPUIMPL
+      if( !m_rnarray.isOnDevice() )
+        throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" );
+#else
+      throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" );
+#endif
+    }
+    else
+    {
+      if( m_rnarray.isOnDevice() )
+        throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" );
+    }
+    createGenerator();
+  }
+
+  //--------------------------------------------------------------------------
+
+  HiprandRandomNumberKernel::~HiprandRandomNumberKernel()
+  {
+    destroyGenerator();
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed )
+  {
+    if( m_isOnDevice )
+    {
+      destroyGenerator(); // workaround for #429
+      createGenerator(); // workaround for #429
+    }
+    //printf( "seedGenerator: seed %d\n", seed );
+    checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) );
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::createGenerator()
+  {
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT;
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW;
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A;
+    const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937;
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10;
+    if( m_isOnDevice )
+    {
+      checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) );
+    }
+    else
+    {
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000
+    }
+    // FIXME: hiprand ordering is not implemented yet
+    // See https://github.com/ROCm/hipRAND/issues/75
+    /*
+    //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) );
+    checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) );
+    //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) );
+    //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) );
+    */
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::destroyGenerator()
+  {
+    checkHiprand( hiprandDestroyGenerator( m_rnGen ) );
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::generateRnarray()
+  {
+#if defined MGONGPU_FPTYPE_DOUBLE
+    checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) );
+#elif defined MGONGPU_FPTYPE_FLOAT
+    checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) );
+#endif
+    /*
+    printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
+    fptype* data = m_rnarray.data();
+#ifdef MGONGPUCPP_GPUIMPL
+    if( m_rnarray.isOnDevice() )
+    {
+      data = new fptype[m_rnarray.size()]();
+      checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) );
+    }
+#endif
+    for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
+      printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] );
+#ifdef MGONGPUCPP_GPUIMPL
+    if( m_rnarray.isOnDevice() ) delete[] data;
+#endif
+    */
+  }
+
+  //--------------------------------------------------------------------------
+#endif
+}
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc
@@ -1,10 +1,10 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? 
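The RamboHost guard added above (repeated verbatim in each generated check_sa.cc) amounts to a small decision function: host-side sampling needs host-side random numbers, and with hiprandCreateGeneratorHost unavailable (hipRAND issue #76) the only downgrade from HiprandDevice is CommonRandom. A standalone sketch, with illustrative type names but the same warning text:

// Illustrative sketch of the RamboHost/HiprandDevice fallback above; Rnd and
// Rmb are invented stand-ins for the RandomNumberMode/RamboSamplingMode enums.
#include <iostream>

enum class Rnd { CommonRandom, HiprandDevice };
enum class Rmb { RamboHost, RamboDevice };

Rnd resolve( Rmb sampling, Rnd gen )
{
  if( sampling == Rmb::RamboHost && gen == Rnd::HiprandDevice )
  {
    // Device-generated numbers cannot feed a host sampler, and HiprandHost is
    // not implemented yet, so downgrade all the way to the common generator
    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
    return Rnd::CommonRandom;
  }
  return gen;
}

int main()
{
  resolve( Rmb::RamboHost, Rnd::HiprandDevice ); // prints the warning and falls back
  return 0;
}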
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no 
hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! 
CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
    const bool onDevice = true;
    prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
#else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
#endif
  }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)
  // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
  std::unique_ptr prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
  // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
  double genrtime = 0;
-  // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+  // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
  // [NB This should not be necessary using the host API: "Generation functions
  // can be called multiple times on the same generator to generate successive
  // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
  //std::cout << "Got random numbers" << std::endl;
#ifdef MGONGPUCPP_GPUIMPL
-  if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+  if( rndgen != RandomNumberMode::CurandDevice &&
+      rndgen != RandomNumberMode::HiprandDevice &&
+      rmbsmp == RamboSamplingMode::RamboDevice )
  {
    // --- 1c. Copy rndmom from host to device
    const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
    rndgentxt = "CURAND HOST";
  else if( rndgen == RandomNumberMode::CurandDevice )
    rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
#ifdef __CUDACC__
  rndgentxt += " (CUDA code)";
#elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
    wrkflwtxt += "FLT+";
#else
    wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
  // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? @@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
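NB (on the RandomNumberKernels.h changes that follow): the curand.h include is dropped in favour of forward declarations, so that only CurandRandomNumberKernel.cc and HiprandRandomNumberKernel.cc ever need the vendor headers. A self-contained sketch of this opaque-handle pattern (the Holder class is a hypothetical example; the curand names are the real ones):

// curand.h declares: typedef struct curandGenerator_st* curandGenerator_t;
// A header that only stores the raw pointer can forward-declare the struct and
// thus avoid including curand.h in every translation unit.
struct curandGenerator_st; // forward declaration: curandGenerator_st stays an incomplete type
class Holder // hypothetical, not part of the patch
{
  curandGenerator_st* m_gen; // pointer to incomplete type: compiles without curand.h
};
// hiprand is analogous, except that hiprandGenerator_st is itself a typedef of
// struct rocrand_generator_base_type, so the header forward-declares the rocrand
// struct and re-creates the typedef, as the hunk below does.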
#ifndef RANDOMNUMBERKERNELS_H
#define RANDOMNUMBERKERNELS_H 1
#include "mgOnGpuConfig.h"
-// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
-#ifndef MGONGPU_HAS_NO_CURAND
-//#include "curand.h"
-struct curandGenerator_st; // forward definition from curand.h
-#endif
-
#include "MemoryBuffers.h"
+// Forward declaration from curand.h (the full header is only needed in CurandRandomNumberKernel.cc)
+struct curandGenerator_st;
+
+// Forward declaration from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc)
+struct rocrand_generator_base_type;
+typedef rocrand_generator_base_type hiprandGenerator_st;
+
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
@@ -107,7 +108,6 @@ namespace mg5amcCpu
  //--------------------------------------------------------------------------
-#ifndef MGONGPU_HAS_NO_CURAND
  // A class encapsulating CURAND random number generation on a CPU host or on a GPU device
  class CurandRandomNumberKernel final : public RandomNumberKernelBase
  {
@@ -142,11 +142,49 @@ namespace mg5amcCpu
    const bool m_isOnDevice;
    // The curand generator
-    // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st')
+    // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st')
    curandGenerator_st* m_rnGen;
  };
-#endif
+  //--------------------------------------------------------------------------
+
+  // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device
+  class HiprandRandomNumberKernel final : public RandomNumberKernelBase
+  {
+  public:
+
+    // Constructor from an existing output buffer
+    HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice );
+
+    // Destructor
+    ~HiprandRandomNumberKernel();
+
+    // Seed the random number generator
+    void seedGenerator( const unsigned int seed ) override final;
+
+    // Generate the random number array
+    void generateRnarray() override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return m_isOnDevice; }
+
+  private:
+
+    // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void createGenerator();
+
+    // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void destroyGenerator();
+
+  private:
+
+    // Is this a host or device kernel?
+    const bool m_isOnDevice;
+
+    // The hiprand generator
+    // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st')
+    hiprandGenerator_st* m_rnGen;
+  };
  //--------------------------------------------------------------------------
}
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
index 1077bdc098..3ad91dfd59 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk
@@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
  comma:=,
  CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
  CUINC = -I$(CUDA_HOME)/include/
-  ifeq ($(RNDGEN),hasNoCurand)
-    CURANDLIBFLAGS=
-  else
-    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
-  endif
  CUOPTFLAGS = -lineinfo
  ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
  GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
@@ -241,7 +236,7 @@ else
  override GPUCC=
  override USE_NVTX=
  override CUINC=
-  override CURANDLIBFLAGS=
+  override HIPINC=
endif
@@ -291,7 +286,7 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD

# Set the default OMPFLAGS choice
ifneq ($(findstring hipcc,$(GPUCC)),)
@@ -352,29 +347,62 @@ ifeq ($(HRDCOD),)
  override HRDCOD = 0
endif

-# Set the default RNDGEN (random number generator) choice
-ifeq ($(RNDGEN),)
-  ifeq ($(GPUCC),)
-    override RNDGEN = hasNoCurand
-  # Edgecase for HIP compilation
-  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-    override RNDGEN = hasNoCurand
-  else ifeq ($(RNDGEN),)
-    override RNDGEN = hasCurand
-  endif
-endif
-
-# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
+# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
export AVX
export FPTYPE
export HELINL
export HRDCOD
-export RNDGEN
export OMPFLAGS

#-------------------------------------------------------------------------------

-#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND
+
+# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out)
+###$(info RNDGEN=$(RNDGEN))
+ifneq ($(RNDGEN),)
+  $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!)
+  ifeq ($(RNDGEN),hasCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifeq ($(RNDGEN),hasNoCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifneq ($(RNDGEN),hasNoCurand)
+    $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+  endif
+endif
+
+# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND
+# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...)
+ifeq ($(HASCURAND),)
+  ifeq ($(GPUCC),) # CPU-only build
+    override HASCURAND = hasNoCurand
+  else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build
+    override HASCURAND = hasCurand
+  else # non-Nvidia GPU build
+    override HASCURAND = hasNoCurand
+  endif
+endif
+
+# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND
+# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...)
+ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including 
ccache) ###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 6bde4466d0..33306bc3e2 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index a902df6bff..1d520f7648 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005403280258178711  +DEBUG: model prefixing takes 0.0055522918701171875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -164,10 +164,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,23 +192,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.152 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.148 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.334 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 5 routines in 0.327 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.321 s +ALOHA: aloha creates 10 routines in 0.315 s VVV1 VVV1 FFV1 @@ -247,17 +247,17 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 238]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m2.252s -user 0m1.984s -sys 0m0.253s -Code generation completed in 3 seconds +real 0m2.204s +user 0m1.932s +sys 0m0.252s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. 
+// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + 
//checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) );
+    */
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::destroyGenerator()
+  {
+    checkHiprand( hiprandDestroyGenerator( m_rnGen ) );
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::generateRnarray()
+  {
+#if defined MGONGPU_FPTYPE_DOUBLE
+    checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) );
+#elif defined MGONGPU_FPTYPE_FLOAT
+    checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) );
+#endif
+    /*
+    printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
+    fptype* data = m_rnarray.data();
+#ifdef MGONGPUCPP_GPUIMPL
+    if( m_rnarray.isOnDevice() )
+    {
+      data = new fptype[m_rnarray.size()]();
+      checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) );
+    }
+#endif
+    for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
+      printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] );
+#ifdef MGONGPUCPP_GPUIMPL
+    if( m_rnarray.isOnDevice() ) delete[] data;
+#endif
+    */
+  }
+
+  //--------------------------------------------------------------------------
+#endif
+}
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
@@ -1,10 +1,10 @@
// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
//==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)
  // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
  std::unique_ptr prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
  // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
  double genrtime = 0;
-  // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+  // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
  // [NB This should not be necessary using the host API: "Generation functions
  // can be called multiple times on the same generator to generate successive
  // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
  //std::cout << "Got random numbers" << std::endl;
#ifdef MGONGPUCPP_GPUIMPL
-  if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+  if( rndgen != RandomNumberMode::CurandDevice &&
+      rndgen != RandomNumberMode::HiprandDevice &&
+      rmbsmp == RamboSamplingMode::RamboDevice )
  {
    // --- 1c. Copy rndmom from host to device
    const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
    rndgentxt = "CURAND HOST";
  else if( rndgen == RandomNumberMode::CurandDevice )
    rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
#ifdef __CUDACC__
  rndgentxt += " (CUDA code)";
#elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
    wrkflwtxt += "FLT+";
#else
    wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
  // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
#ifdef __CUDACC__
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
    wrkflwtxt += "CXS:";
#else
    wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
#endif
  // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
  if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
    wrkflwtxt += "CURHST+";
  else if( rndgen == RandomNumberMode::CurandDevice )
    wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
  else
    wrkflwtxt += "??????+"; // no path to this statement
  // -- HOST or DEVICE rambo sampling?
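NB (on the check_sa.cc hunks above): the RandomNumberMode values were renumbered (CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2), so host generators are negative, device generators positive, and the common fallback sits at zero. The patch itself only ever compares modes for equality; the helpers below are hypothetical and merely document what the sign convention would allow:

// Hypothetical helpers exploiting the sign convention of the renumbered enum
// (not used anywhere in the patch).
inline bool isDeviceGen( RandomNumberMode m ) { return static_cast<int>( m ) > 0; }
inline bool isHostGen( RandomNumberMode m ) { return static_cast<int>( m ) < 0; }
// Example: the "1c" copy guard could then be written as
//   if( !isDeviceGen( rndgen ) && rmbsmp == RamboSamplingMode::RamboDevice ) { ... }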
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
#ifdef MGONGPUCPP_GPUIMPL
            //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
#endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
            << "\"" << rndgentxt << "\"," << std::endl;
  double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h
index 21d63beeac..7ed728a26c 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h
@@ -1,21 +1,22 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.

#ifndef RANDOMNUMBERKERNELS_H
#define RANDOMNUMBERKERNELS_H 1

#include "mgOnGpuConfig.h"

-// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
-#ifndef MGONGPU_HAS_NO_CURAND
-//#include "curand.h"
-struct curandGenerator_st; // forward definition from curand.h
-#endif
-
#include "MemoryBuffers.h"

+// Forward declaration from curand.h (the full header is only needed in CurandRandomNumberKernel.cc)
+struct curandGenerator_st;
+
+// Forward declaration from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc)
+struct rocrand_generator_base_type;
+typedef rocrand_generator_base_type hiprandGenerator_st;
+
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
@@ -107,7 +108,6 @@ namespace mg5amcCpu
  //--------------------------------------------------------------------------
-#ifndef MGONGPU_HAS_NO_CURAND
  // A class encapsulating CURAND random number generation on a CPU host or on a GPU device
  class CurandRandomNumberKernel final : public RandomNumberKernelBase
  {
@@ -142,11 +142,49 @@ namespace mg5amcCpu
    const bool m_isOnDevice;
    // The curand generator
-    // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st')
+    // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st')
    curandGenerator_st* m_rnGen;
  };
-#endif
+  //--------------------------------------------------------------------------
+
+  // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device
+  class HiprandRandomNumberKernel final : public RandomNumberKernelBase
+  {
+  public:
+
+    // Constructor from an existing output buffer
+    HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice );
+
+    // Destructor
+    ~HiprandRandomNumberKernel();
+
+    // Seed the random number generator
+    void seedGenerator( const unsigned int seed ) override final;
+
+    // Generate the random number array
+    void generateRnarray() override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return m_isOnDevice; }
+
+  private:
+
+    // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void createGenerator();
+
+    // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void destroyGenerator();
+
+  private:
+
+    // Is this a host or device kernel?
+    const bool m_isOnDevice;
+
+    // The hiprand generator
+    // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st')
+    hiprandGenerator_st* m_rnGen;
+  };
  //--------------------------------------------------------------------------
}
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
index 1077bdc098..3ad91dfd59 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk
@@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
  comma:=,
  CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
  CUINC = -I$(CUDA_HOME)/include/
-  ifeq ($(RNDGEN),hasNoCurand)
-    CURANDLIBFLAGS=
-  else
-    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
-  endif
  CUOPTFLAGS = -lineinfo
  ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
  GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
@@ -241,7 +236,7 @@ else
  override GPUCC=
  override USE_NVTX=
  override CUINC=
-  override CURANDLIBFLAGS=
+  override HIPINC=
endif
@@ -291,7 +286,7 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD

# Set the default OMPFLAGS choice
ifneq ($(findstring hipcc,$(GPUCC)),)
@@ -352,29 +347,62 @@ ifeq ($(HRDCOD),)
  override HRDCOD = 0
endif

-# Set the default RNDGEN (random number generator) choice
-ifeq ($(RNDGEN),)
-  ifeq ($(GPUCC),)
-    override RNDGEN = hasNoCurand
-  # Edgecase for HIP compilation
-  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-    override RNDGEN = hasNoCurand
-  else ifeq ($(RNDGEN),)
-    override RNDGEN = hasCurand
-  endif
-endif
-
-# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
+# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
export AVX
export FPTYPE
export HELINL
export HRDCOD
-export RNDGEN
export OMPFLAGS

#-------------------------------------------------------------------------------

-#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND
+
+# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out)
+###$(info RNDGEN=$(RNDGEN))
+ifneq ($(RNDGEN),)
+  $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!)
+  ifeq ($(RNDGEN),hasCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifeq ($(RNDGEN),hasNoCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifneq ($(RNDGEN),hasNoCurand)
+    $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info 
GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 6bde4466d0..33306bc3e2 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index f019f6812d..26d60e142d 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005845308303833008  +DEBUG: model prefixing takes 0.0054738521575927734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
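Taken together, the two mgOnGpuConfig.h blocks (curand and hiprand) reduce to a simple compile-time matrix: hipcc disables curand, nvcc disables hiprand, and a plain C++ build leaves both selectable unless a -DMGONGPU_HAS_NO_... flag is passed from outside. A minimal standalone sketch of the effective defaults, assuming the curand block elided in the hunk above mirrors the hiprand block shown:

// Sketch only (not part of the patch): effective mgOnGpuConfig.h defaults per compiler.
#if defined __HIPCC__
#define MGONGPU_HAS_NO_CURAND 1 // HIP/AMD build: curand is never used
#endif
#if defined __CUDACC__
#define MGONGPU_HAS_NO_HIPRAND 1 // CUDA/NVidia build: hiprand is never used
#endif
// Plain C++ build: neither macro is defined here, so either generator may still
// be disabled externally, e.g. with -DMGONGPU_HAS_NO_CURAND.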
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -163,29 +163,29 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=0 [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.335 s +ALOHA: aloha creates 5 routines in 0.323 s VVV1 VVV1 FFV1 @@ -205,7 +205,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m1.128s -user 0m0.748s -sys 0m0.062s +real 0m0.778s +user 0m0.718s +sys 0m0.051s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + 
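// Illustration (not part of the patch): the caller-side lifecycle implied by the
// methods above, assuming a device build where 'devRndmom' is a device-side
// BufferRndNumMomenta; the function name is hypothetical.
void exampleHiprandLifecycle( BufferRndNumMomenta& devRndmom )
{
  HiprandRandomNumberKernel rng( devRndmom, true ); // onDevice = true
  rng.seedGenerator( 20240101 );                    // destroys and recreates the generator (workaround for #429), then seeds it
  rng.generateRnarray();                            // fills devRndmom with uniform random numbers
} // the destructor destroys the generator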
//-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 
The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+      const bool onDevice = false;
+      prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+    }
+    else if( rndgen == RandomNumberMode::HiprandDevice )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+      const bool onDevice = true;
+      prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+      throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+    }
+    else
+      throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)

     // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
     std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
     // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
     double genrtime = 0;

-    // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+    // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
     // [NB This should not be necessary using the host API: "Generation functions
     // can be called multiple times on the same generator to generate successive
     // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
     //std::cout << "Got random numbers" << std::endl;

 #ifdef MGONGPUCPP_GPUIMPL
-    if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+    if( rndgen != RandomNumberMode::CurandDevice &&
+        rndgen != RandomNumberMode::HiprandDevice &&
+        rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
       const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
     wrkflwtxt += "FLT+";
 #else
     wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
     wrkflwtxt += "COMMON+";
   else if( rndgen == RandomNumberMode::CurandHost )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
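The chain of prnk.reset calls above is a straightforward mode-to-kernel dispatch, pairing each enum value with one kernel class and one buffer (host or device). Condensed into a sketch, using the types declared in this file (makeRng is a hypothetical helper, and CommonRandomNumberKernel is assumed to be the common-random kernel declared alongside the curand/hiprand ones in RandomNumberKernels.h):

// Sketch only: one factory call per RandomNumberMode value.
std::unique_ptr<RandomNumberKernelBase> makeRng( RandomNumberMode mode, BufferRndNumMomenta& hstRndmom, BufferRndNumMomenta& devRndmom )
{
  switch( mode )
  {
    case RandomNumberMode::CommonRandom: return std::make_unique<CommonRandomNumberKernel>( hstRndmom );
    case RandomNumberMode::CurandHost: return std::make_unique<CurandRandomNumberKernel>( hstRndmom, false );
    case RandomNumberMode::CurandDevice: return std::make_unique<CurandRandomNumberKernel>( devRndmom, true );
    case RandomNumberMode::HiprandDevice: return std::make_unique<HiprandRandomNumberKernel>( devRndmom, true );
    default: throw std::logic_error( "Unknown rndgen value?" ); // HiprandHost: hiprandCreateGeneratorHost not implemented yet
  }
}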
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
            //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-           << "\"Curand generation\": "
+           << "\"Random generation\": "
            << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h
index 21d63beeac..7ed728a26c 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h
@@ -1,21 +1,22 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.

 #ifndef RANDOMNUMBERKERNELS_H
 #define RANDOMNUMBERKERNELS_H 1

 #include "mgOnGpuConfig.h"

-// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
-#ifndef MGONGPU_HAS_NO_CURAND
-//#include "curand.h"
-struct curandGenerator_st; // forward definition from curand.h
-#endif
-
 #include "MemoryBuffers.h"

+// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc)
+struct curandGenerator_st;
+
+// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc)
+struct rocrand_generator_base_type;
+typedef rocrand_generator_base_type hiprandGenerator_st;
+
 #ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
 #endif
@@ -107,7 +108,6 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifndef MGONGPU_HAS_NO_CURAND
   // A class encapsulating CURAND random number generation on a CPU host or on a GPU device
   class CurandRandomNumberKernel final : public RandomNumberKernelBase
   {
@@ -142,11 +142,49 @@ namespace mg5amcCpu
     const bool m_isOnDevice;

     // The curand generator
-    // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st')
+    // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st')
     curandGenerator_st* m_rnGen;
   };
-#endif

+  //--------------------------------------------------------------------------
+
+  // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device
+  class HiprandRandomNumberKernel final : public RandomNumberKernelBase
+  {
+  public:
+
+    // Constructor from an existing output buffer
+    HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice );
+
+    // Destructor
+    ~HiprandRandomNumberKernel();
+
+    // Seed the random number generator
+    void seedGenerator( const unsigned int seed ) override final;
+
+    // Generate the random number array
+    void generateRnarray() override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return m_isOnDevice; }
+
+  private:
+
+    // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void createGenerator();
+
+    // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void destroyGenerator();
+
+  private:
+
+    // Is this a host or device kernel?
+    const bool m_isOnDevice;
+
+    // The hiprand generator
+    // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st')
+    hiprandGenerator_st* m_rnGen;
+  };

   //--------------------------------------------------------------------------
 }
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
index 1077bdc098..3ad91dfd59 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk
@@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
   comma:=,
   CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
   CUINC = -I$(CUDA_HOME)/include/
-  ifeq ($(RNDGEN),hasNoCurand)
-    CURANDLIBFLAGS=
-  else
-    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
-  endif
   CUOPTFLAGS = -lineinfo
   ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
   GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
@@ -241,7 +236,7 @@ else
   override GPUCC=
   override USE_NVTX=
   override CUINC=
-  override CURANDLIBFLAGS=
+  override HIPINC=
 endif
@@ -291,7 +286,7 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD

 # Set the default OMPFLAGS choice
 ifneq ($(findstring hipcc,$(GPUCC)),)
@@ -352,29 +347,62 @@ ifeq ($(HRDCOD),)
   override HRDCOD = 0
 endif

-# Set the default RNDGEN (random number generator) choice
-ifeq ($(RNDGEN),)
-  ifeq ($(GPUCC),)
-    override RNDGEN = hasNoCurand
-  # Edgecase for HIP compilation
-  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-    override RNDGEN = hasNoCurand
-  else ifeq ($(RNDGEN),)
-    override RNDGEN = hasCurand
-  endif
-endif
-
-# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
+# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
 export AVX
 export FPTYPE
 export HELINL
 export HRDCOD
-export RNDGEN
 export OMPFLAGS

 #-------------------------------------------------------------------------------

-#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND
+
+# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out)
+###$(info RNDGEN=$(RNDGEN))
+ifneq ($(RNDGEN),)
+  $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!)
+  ifeq ($(RNDGEN),hasCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifeq ($(RNDGEN),hasNoCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifneq ($(RNDGEN),hasNoCurand)
+    $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info 
GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index 475749ca7c..22d6921fda 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 256a6fc6ee..8b11d9e97b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005756378173828125  +DEBUG: model prefixing takes 0.005313873291015625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.161 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -164,10 +164,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,23 +192,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.432 s -Wrote files for 222 helas calls in 0.707 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s +Wrote files for 222 helas calls in 0.700 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.336 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 5 routines in 0.327 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -250,17 +250,17 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). 
Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 238]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.990s -user 0m3.058s -sys 0m0.274s -Code generation completed in 4 seconds +real 0m3.327s +user 0m3.009s +sys 0m0.283s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
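The "NB This must come AFTER mgOnGpuConfig.h" comment added to CurandRandomNumberKernel.cc above deserves a gloss. A minimal sketch of the ordering constraint it protects, assuming (as the comment states, without showing the definition itself) that mgOnGpuConfig.h supplies a definition of __global__ when MGONGPUCPP_GPUIMPL is not defined:

// Sketch: why the include order is fixed in C++-only builds
#include "mgOnGpuConfig.h" // provides a definition of __global__ when MGONGPUCPP_GPUIMPL is undefined
#include "curand.h"        // must be parsed with that definition already in place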
+ +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, 
HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? 
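Taken together, the generator-selection branches above reduce to one polymorphic call sequence on RandomNumberKernelBase. A hedged sketch of how the selected kernel is then driven (class, method, and buffer names are from this patch; the seed value is invented for illustration):

// Sketch: what happens to prnk after the selection chain above
std::unique_ptr<RandomNumberKernelBase> prnk;
prnk.reset( new HiprandRandomNumberKernel( devRndmom, /*onDevice=*/true ) ); // HiprandDevice branch
prnk->seedGenerator( 20240123 ); // hypothetical seed; for device generators this destroys and recreates the generator (workaround for #429)
prnk->generateRnarray();         // hiprandGenerateUniform[Double] fills the random-number buffer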
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? 
+ const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) 
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) 
###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 6bde4466d0..33306bc3e2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, hiprand is allowed by default, but this macro can be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 97550bc867..8a72b5a0f4 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005414724349975586  +DEBUG: model prefixing takes 0.0057239532470703125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
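Also worth condensing: the __HIP_PLATFORM_* contract that this patch spreads over two files. mgOnGpuConfig.h force-undefines the NVIDIA platform macro, and HiprandRandomNumberKernel.cc force-defines the AMD one before including the hiprand header, so hiprand always resolves to the rocrand backend:

// Condensed from mgOnGpuConfig.h and HiprandRandomNumberKernel.cc (sketch):
#undef __HIP_PLATFORM_NVIDIA__  // never route hiprand through curand on NVidia
#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__ 1  // enable hiprand for AMD (rocrand)
#endif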
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.161 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -163,29 +163,29 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=0 [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.432 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.421 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.323 s +ALOHA: aloha creates 5 routines in 0.320 s VVV1 VVV1 FFV1 @@ -208,7 +208,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.483s -user 0m1.404s -sys 0m0.054s -Code generation completed in 2 seconds +real 0m1.486s +user 0m1.373s +sys 0m0.056s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( 
hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling?
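(Editorial example, not part of the patch.) For readers unfamiliar with the hiprand host-library calls that HiprandRandomNumberKernel wraps, a minimal standalone C++ sketch of the same create/seed/generate/destroy sequence follows; the helper name rngSketch, the buffer size and the seed value are illustrative assumptions, while the generator type and the API calls are those used in the patch.

// Minimal sketch, assuming a ROCm installation providing <hip/hip_runtime.h> and <hiprand/hiprand.h>
#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__ 1 // as in HiprandRandomNumberKernel.cc above
#endif
#include <hip/hip_runtime.h>
#include <hiprand/hiprand.h>
#include <cassert>
void rngSketch() // hypothetical helper, not in the patch
{
  const size_t nrnd = 1024; // placeholder: number of random values to generate
  double* devbuf = nullptr;
  assert( hipMalloc( (void**)&devbuf, nrnd * sizeof( double ) ) == hipSuccess ); // device output buffer
  hiprandGenerator_t gen;
  assert( hiprandCreateGenerator( &gen, HIPRAND_RNG_PSEUDO_MTGP32 ) == HIPRAND_STATUS_SUCCESS ); // same engine as the patch
  assert( hiprandSetPseudoRandomGeneratorSeed( gen, 12345 ) == HIPRAND_STATUS_SUCCESS ); // placeholder seed
  assert( hiprandGenerateUniformDouble( gen, devbuf, nrnd ) == HIPRAND_STATUS_SUCCESS ); // fills devbuf on the device
  assert( hiprandDestroyGenerator( gen ) == HIPRAND_STATUS_SUCCESS );
  assert( hipFree( devbuf ) == hipSuccess );
}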
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or
device kernel? + const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_...
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info 
GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index 475749ca7c..22d6921fda 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, allow curand to be used, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g.
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, allow hiprand to be used, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index e87e82777e..df59413576 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005635499954223633  +DEBUG: model prefixing takes 0.0055010318756103516  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.913 s +1 processes with 1240 diagrams generated in 1.859 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -179,7 +179,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s.
Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,23 +194,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.741 s -Wrote files for 2281 helas calls in 19.148 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.577 s +Wrote files for 2281 helas calls in 18.096 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.327 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 5 routines in 0.313 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.322 s +ALOHA: aloha creates 10 routines in 0.305 s VVV1 VVV1 FFV1 @@ -252,17 +252,17 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 238]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. 
quit -real 0m30.124s -user 0m29.560s -sys 0m0.444s -Code generation completed in 31 seconds +real 0m28.769s +user 0m28.239s +sys 0m0.414s +Code generation completed in 29 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // 
workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index bde384c69e..e086ae12d9 100644 ---
a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) 
enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
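(Editorial illustration, not part of the patch.) The if/else chain that follows maps each RandomNumberMode to a short workflow tag; a hypothetical switch-based equivalent makes the mapping easier to scan. The "COMMON+" tag for CommonRandom is inferred from the surrounding tags, since that line lies outside the hunk context shown here.

// Hypothetical restatement of the tag mapping below, assuming the RandomNumberMode enum defined earlier in this file
inline const char* rndgenTag( RandomNumberMode m )
{
  switch( m )
  {
    case RandomNumberMode::CommonRandom: return "COMMON+"; // inferred, not shown in the hunk
    case RandomNumberMode::CurandHost: return "CURHST+";
    case RandomNumberMode::CurandDevice: return "CURDEV+";
    case RandomNumberMode::HiprandHost: return "HIRHST+";
    case RandomNumberMode::HiprandDevice: return "HIRDEV+";
  }
  return "??????+"; // no path to this statement
}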
if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? @@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or
device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? + const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!)
+ ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) + endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of
HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including 
ccache) ###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 6bde4466d0..33306bc3e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, hiprand is allowed, but this macro can be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index ed04396083..faec804f1b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005763530731201172  +DEBUG: model prefixing takes 0.005358695983886719  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.897 s +1 processes with 1240 diagrams generated in 1.872 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -163,29 +163,29 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=0 [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.650 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.517 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.357 s +ALOHA: aloha creates 5 routines in 0.346 s VVV1 VVV1 FFV1 @@ -208,7 +208,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m13.275s -user 0m13.097s -sys 0m0.099s -Code generation completed in 14 seconds +real 0m12.953s +user 0m12.719s +sys 0m0.135s +Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include <cassert> + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include <hiprand/hiprand.h> +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( 
hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? 
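// (Reading aid, summarising the selection logic above rather than adding to it: CUDA builds default to CurandDevice with curand and to CommonRandom without it; HIP builds default to HiprandDevice with hiprand and to CommonRandom without it; C++ builds default to CurandHost with curand, to HiprandDevice with hiprand only, and to CommonRandom with neither; and a HiprandDevice choice is further downgraded to CommonRandom whenever RamboHost sampling is selected, since HiprandHost is not implemented yet.)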
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? 
+ const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this takes precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) 
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info RNDLIBFLAGS=$(RNDLIBFLAGS)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) 
###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index 475749ca7c..22d6921fda 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, hiprand is allowed, but this macro can be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index e9279c1f7f..e72f8836f6 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0056858062744140625  +DEBUG: model prefixing takes 0.005477190017700195  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -179,10 +179,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -200,7 +200,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,7 +217,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -232,17 +232,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -Wrote files for 32 helas calls in 0.223 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +Wrote files for 32 helas calls in 0.217 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.149 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 2 routines in 0.143 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.136 s +ALOHA: aloha creates 4 routines in 0.130 s FFV1 FFV1 FFV1 @@ -289,17 +289,17 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 238]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m1.992s -user 0m1.732s -sys 0m0.249s -Code generation completed in 2 seconds +real 0m3.073s +user 0m1.692s +sys 0m0.237s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, 
HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
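The checkHiprand/assertHiprand pair added above follows the same pattern as the existing checkCurand/assertCurand: wrap every hiprand call, print file and line on a non-success status, and assert. A self-contained analogue of the pattern with a stand-in status type (DemoStatus and the Demo names are hypothetical, for illustration only):

#include <cassert>
#include <cstdio>

// Stand-in for hiprandStatus_t / curandStatus_t (hypothetical enum)
enum DemoStatus { DEMO_STATUS_SUCCESS = 0, DEMO_STATUS_ERROR = 1 };

#define checkDemo( code ){ assertDemo( code, __FILE__, __LINE__ ); }
inline void assertDemo( DemoStatus code, const char* file, int line, bool abort = true )
{
  if( code != DEMO_STATUS_SUCCESS )
  {
    printf( "DemoAssert: %s:%d code=%d\n", file, line, code ); // same message format as HiprandAssert
    if( abort ) assert( code == DEMO_STATUS_SUCCESS );
  }
}

int main()
{
  checkDemo( DEMO_STATUS_SUCCESS ); // passes silently
  return 0;
}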
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
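The renumbered RandomNumberMode enum is no longer a plain 0/1/2 sequence: host generators now sit at negative values (CurandHost=-1, HiprandHost=-2) and device generators at positive values (CurandDevice=1, HiprandDevice=2), with CommonRandom at 0. A sketch of the sign test this layout would permit (an assumption about intent, not code from the patch; isDeviceMode is a hypothetical helper):

#include <iostream>

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

// Hypothetical helper: classify a mode as device-side by its sign
inline bool isDeviceMode( RandomNumberMode m ) { return static_cast<int>( m ) > 0; }

int main()
{
  std::cout << isDeviceMode( RandomNumberMode::HiprandDevice ) << std::endl; // 1 (device)
  std::cout << isDeviceMode( RandomNumberMode::CurandHost ) << std::endl;    // 0 (host)
  std::cout << isDeviceMode( RandomNumberMode::CommonRandom ) << std::endl;  // 0 (host-side)
  return 0;
}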
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling?
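The generator allocation in step 0c is now an explicit if/else chain over all five modes, each branch guarded by the matching build macros and closing with an internal-error fallback. Structurally this is a hand-rolled factory; a condensed standalone analogue with stub types (all names here are hypothetical, sketch only):

#include <memory>
#include <stdexcept>

struct KernelBase { virtual ~KernelBase() = default; };  // stub for RandomNumberKernelBase
struct CommonKernel : KernelBase {};                     // stub for the common random kernel
struct CurandHostKernel : KernelBase {};                 // stub, host-side curand
struct HiprandDeviceKernel : KernelBase {};              // stub, device-side hiprand

enum class Mode { Common = 0, CurandHost = -1, HiprandDevice = 2 };

std::unique_ptr<KernelBase> makeKernel( Mode m ) // hypothetical factory
{
  if( m == Mode::Common ) return std::make_unique<CommonKernel>();
  if( m == Mode::CurandHost ) return std::make_unique<CurandHostKernel>();
  if( m == Mode::HiprandDevice ) return std::make_unique<HiprandDeviceKernel>();
  throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // same fallback as the patch
}

int main()
{
  auto prnk = makeKernel( Mode::Common );
  return 0;
}

The patch keeps the explicit chain instead of a factory, which has the advantage of leaving the MGONGPU_HAS_NO_* build-macro guards visible at each allocation site.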
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no 
hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! 
CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
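As with curand earlier in the function, RamboHost sampling needs host-side random numbers, so a HiprandDevice selection is demoted; and since hiprandCreateGeneratorHost is still unimplemented (ROCm/hipRAND issue #76), the demotion lands on CommonRandom rather than HiprandHost. A standalone sketch of that decision (stand-in enum and helper names, illustration only):

#include <iostream>

enum class Rnd { Common, HiprandHost, HiprandDevice };
enum class Rmb { Host, Device };

// Hypothetical helper mirroring the fallback: host-side Rambo cannot consume
// device-resident random numbers, and HiprandHost is not available yet.
Rnd adjustForRambo( Rnd rndgen, Rmb rmbsmp )
{
  if( rmbsmp == Rmb::Host && rndgen == Rnd::HiprandDevice )
  {
    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
    return Rnd::Common;
  }
  return rndgen;
}

int main()
{
  Rnd r = adjustForRambo( Rnd::HiprandDevice, Rmb::Host );
  std::cout << ( r == Rnd::Common ) << std::endl; // 1
  return 0;
}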
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? @@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const 
bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? + const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this take precedence over any 
HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else ifneq ($(RNDGEN),hasNoCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) + endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override 
CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info 
GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 6bde4466d0..33306bc3e2 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, allow hiprand to be used, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 3a6e2d99fc..3957e3a7d6 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005571842193603516  +DEBUG: model prefixing takes 0.00551295280456543  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams.
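The config header above also pins the hiprand platform: __HIP_PLATFORM_NVIDIA__ is undefined globally in mgOnGpuConfig.h, while HiprandRandomNumberKernel.cc defines __HIP_PLATFORM_AMD__ before pulling in the hiprand header, so the rocrand (AMD) backend is always the one compiled. A compilable sketch of the guard pattern (the two platform macros are real hiprand conventions; the print is illustration only):

#include <cstdio>

// Pin the AMD (rocrand) backend before any hiprand header would be included,
// mirroring mgOnGpuConfig.h and HiprandRandomNumberKernel.cc
#undef __HIP_PLATFORM_NVIDIA__
#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__ 1
#endif

int main()
{
#ifdef __HIP_PLATFORM_AMD__
  printf( "hiprand would dispatch to rocrand (AMD backend)\n" );
#endif
  return 0;
}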
-8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -178,8 +178,8 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 @@ -192,28 +192,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=0 [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=1 [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=1 [output.py at line 198]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.149 s +ALOHA: aloha creates 2 routines in 0.143 s FFV1 FFV1 FFV1 @@ -229,7 +229,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.672s -user 0m0.602s -sys 0m0.062s -Code generation completed in 0 seconds +real 0m1.009s +user 0m0.580s +sys 0m0.067s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, 
HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
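For context on how the new kernel is driven, here is a minimal usage sketch (illustrative only, not part of the patch): it assumes a GPU build with hiprand available, reuses the BufferRndNumMomenta class and mg5amcGpu namespace from the sources above, and uses an arbitrary seed value.

// Hypothetical driver: create the kernel on the device buffer, seed, generate.
#include "MemoryBuffers.h"
#include "RandomNumberKernels.h"
using namespace mg5amcGpu;
void fillRandoms( BufferRndNumMomenta& devRndmom )
{
  const bool onDevice = true;
  HiprandRandomNumberKernel prnk( devRndmom, onDevice ); // throws if buffer/device placement is inconsistent
  prnk.seedGenerator( 20240101 ); // illustrative fixed seed; on device this recreates the generator first (workaround for #429)
  prnk.generateRnarray();         // fills devRndmom via hiprandGenerateUniform[Double]
}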
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
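The reworked RandomNumberMode values above encode two orthogonal facts: the sign distinguishes host (negative) from device (positive) generators, and the magnitude identifies the library (1 = curand, 2 = hiprand, 0 = common). A sketch of predicates this convention would support (hypothetical helpers, not in the patch):

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };
// Hypothetical predicates exploiting the sign/magnitude encoding
inline bool isDeviceGen( RandomNumberMode m ) { return static_cast<int>( m ) > 0; }
inline bool usesCurand( RandomNumberMode m ) { const int v = static_cast<int>( m ); return v == 1 || v == -1; }
inline bool usesHiprand( RandomNumberMode m ) { const int v = static_cast<int>( m ); return v == 2 || v == -2; }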
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? 
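The generator-allocation chain above now has five branches; the same dispatch could be hoisted into a factory. A hedged sketch (the makeRandomNumberKernel helper is hypothetical, not in the patch; it assumes the kernel classes and buffers declared in check_sa.cc are in scope and that the relevant library was compiled in):

#include <memory>
#include <stdexcept>
std::unique_ptr<RandomNumberKernelBase> makeRandomNumberKernel( RandomNumberMode m, BufferRndNumMomenta& hstRndmom, BufferRndNumMomenta& devRndmom )
{
  switch( m )
  {
    case RandomNumberMode::CommonRandom: return std::make_unique<CommonRandomNumberKernel>( hstRndmom );
    case RandomNumberMode::CurandHost: return std::make_unique<CurandRandomNumberKernel>( hstRndmom, /*onDevice=*/false );
    case RandomNumberMode::CurandDevice: return std::make_unique<CurandRandomNumberKernel>( devRndmom, /*onDevice=*/true );
    case RandomNumberMode::HiprandHost: return std::make_unique<HiprandRandomNumberKernel>( hstRndmom, /*onDevice=*/false );
    case RandomNumberMode::HiprandDevice: return std::make_unique<HiprandRandomNumberKernel>( devRndmom, /*onDevice=*/true );
  }
  throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // mirrors the final else branch in the patch
}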
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/HiprandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? 
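The RamboHost consistency block above downgrades HiprandDevice because host-side Rambo sampling consumes host-side random numbers, and HiprandHost is still blocked by hipRAND issue #76. The same rule as a standalone function (a sketch under those assumptions, not code from the patch):

#include <iostream>
RandomNumberMode normalizeForRamboHost( RamboSamplingMode rmbsmp, RandomNumberMode rndgen )
{
  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
  {
    // HiprandHost would be the natural fallback, but hiprandCreateGeneratorHost is not implemented yet
    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
    return RandomNumberMode::CommonRandom;
  }
  return rndgen;
}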
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? 
+ const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this take precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else ifneq ($(RNDGEN),hasNoCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) 
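The forward declarations now collected in RandomNumberKernels.h follow the opaque-handle idiom: the header stores only a pointer to an incomplete vendor type, so translation units that never generate random numbers do not need curand.h or hiprand.h. A distilled, self-contained sketch of the idiom (generic names, not from the patch):

// Header side: no vendor include required, only an incomplete type.
struct vendorGenerator_st; // defined (and typedef'ed to a handle) inside the vendor header
class OpaqueRngKernel
{
public:
  OpaqueRngKernel() : m_gen( nullptr ) {} // a real ctor would call vendorCreateGenerator( &m_gen ) in the .cc file
  ~OpaqueRngKernel() {}                   // a real dtor would call vendorDestroyGenerator( m_gen ) in the .cc file
private:
  vendorGenerator_st* m_gen; // opaque handle: stored and passed around, never dereferenced here
};
int main() { OpaqueRngKernel k; return 0; } // compiles without any vendor SDK installed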
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) ###$(info 
GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index 475749ca7c..22d6921fda 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 1512f69472..9d97c918db 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -137,25 +137,25 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=0 [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. 
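Taken together, the two mgOnGpuConfig.h blocks make the vendor libraries mutually exclusive per compiler: hipcc builds hard-disable curand and nvcc builds hard-disable hiprand, while plain C++ builds leave both macros to the makefile. A condensed sketch of the effective decision table (paraphrased, not the verbatim header):

#if defined __HIPCC__
#define MGONGPU_HAS_NO_CURAND 1 // AMD build: never use curand
#elif defined __CUDACC__
#define MGONGPU_HAS_NO_HIPRAND 1 // NVidia build: never use hiprand
#endif
// C++-only builds define neither macro here: cudacpp.mk injects
// -DMGONGPU_HAS_NO_CURAND / -DMGONGPU_HAS_NO_HIPRAND according to HASCURAND / HASHIPRAND.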
Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.061 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -167,7 +167,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.452s -user 0m0.372s -sys 0m0.059s +real 0m0.423s +user 0m0.361s +sys 0m0.053s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, 
HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/HiprandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
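As in the gq_ttq copy above, seedGenerator destroys and recreates the device generator before reseeding (the #429 workaround), so every reseed starts from a pristine generator. The bare call sequence, as a sketch (assumes an AMD build where <hiprand/hiprand.h> is available; status codes left unchecked for brevity):

#include <hiprand/hiprand.h>
hiprandGenerator_t gen; // created elsewhere with hiprandCreateGenerator
void reseed( unsigned long long seed )
{
  hiprandDestroyGenerator( gen );                            // drop all internal generator state
  hiprandCreateGenerator( &gen, HIPRAND_RNG_PSEUDO_MTGP32 ); // fresh generator of the same RNG type
  hiprandSetPseudoRandomGeneratorSeed( gen, seed );          // the seed now fully determines the stream
}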
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -193,6 +207,26 @@ main( int argc, char** argv )
      throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
#else
      rndgen = RandomNumberMode::CurandHost;
+#endif
+    }
+    else if( arg == "--hirdev" )
+    {
+#ifndef __HIPCC__
+      throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" );
+#elif defined MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" );
+#else
+      rndgen = RandomNumberMode::HiprandDevice;
+#endif
+    }
+    else if( arg == "--hirhst" )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" );
+#else
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //rndgen = RandomNumberMode::HiprandHost;
#endif
    }
    else if( arg == "--common" )
@@ -265,6 +299,20 @@ main( int argc, char** argv )
#endif
  }

+  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
+  {
+#if not defined MGONGPU_HAS_NO_HIPRAND
+    // See https://github.com/ROCm/hipRAND/issues/76
+    //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl;
+    //rndgen = RandomNumberMode::HiprandHost;
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#else
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#endif
+  }
+
  constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
  constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout
@@ -415,7 +463,7 @@ main( int argc, char** argv )
  std::unique_ptr<double[]> wavetimes( new double[niter] );
  std::unique_ptr<double[]> wv3atimes( new double[niter] );

-  // --- 0c. Create curand or common generator
+  // --- 0c. Create curand, hiprand or common generator
  const std::string cgenKey = "0c GenCreat";
  timermap.start( cgenKey );
  // Allocate the appropriate RandomNumberKernel
@@ -433,7 +481,7 @@ main( int argc, char** argv )
      prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
#endif
  }
-  else
+  else if( rndgen == RandomNumberMode::CurandDevice )
  {
#ifdef MGONGPU_HAS_NO_CURAND
    throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
    const bool onDevice = true;
    prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
#else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
#endif
  }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)

  // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
  std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
  // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
  double genrtime = 0;

-  // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+  // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
  // [NB This should not be necessary using the host API: "Generation functions
  // can be called multiple times on the same generator to generate successive
  // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
  //std::cout << "Got random numbers" << std::endl;

#ifdef MGONGPUCPP_GPUIMPL
-  if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+  if( rndgen != RandomNumberMode::CurandDevice &&
+      rndgen != RandomNumberMode::HiprandDevice &&
+      rmbsmp == RamboSamplingMode::RamboDevice )
  {
    // --- 1c. Copy rndmom from host to device
    const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
    rndgentxt = "CURAND HOST";
  else if( rndgen == RandomNumberMode::CurandDevice )
    rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
#ifdef __CUDACC__
  rndgentxt += " (CUDA code)";
#elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
    wrkflwtxt += "FLT+";
#else
    wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
  // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
#ifdef __CUDACC__
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
    wrkflwtxt += "CXS:";
#else
    wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
#endif
  // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
  if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
    wrkflwtxt += "CURHST+";
  else if( rndgen == RandomNumberMode::CurandDevice )
    wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
  else
    wrkflwtxt += "??????+"; // no path to this statement
  // -- HOST or DEVICE rambo sampling?
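[Aside, illustration only, not part of the patch: the dispatch in the hunks above fills a single polymorphic pointer, after which the caller no longer cares which backend was chosen. The sketch below shows how the selected kernel is then driven through the common interface; it assumes 'prnk' is the std::unique_ptr<RandomNumberKernelBase> targeted by the prnk.reset calls above, and 'niter' and the base seed are hypothetical values.]

  // Sketch only: drive whichever random-number backend was selected above.
  const unsigned int baseSeed = 20240101; // hypothetical base seed
  for( int iiter = 0; iiter < niter; ++iiter )
  {
    prnk->seedGenerator( baseSeed + iiter ); // reseed per iteration (cf. the #429 workaround)
    prnk->generateRnarray();                 // fill the output buffer passed at construction
  }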
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
#ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
#endif
-             << "\"Curand generation\": "
+             << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
  double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h
index 21d63beeac..7ed728a26c 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h
@@ -1,21 +1,22 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin.

#ifndef RANDOMNUMBERKERNELS_H
#define RANDOMNUMBERKERNELS_H 1

#include "mgOnGpuConfig.h"

-// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined
-#ifndef MGONGPU_HAS_NO_CURAND
-//#include "curand.h"
-struct curandGenerator_st; // forward definition from curand.h
-#endif
-
#include "MemoryBuffers.h"

+// Forward definition from curand.h (the full header is only needed in CurandRandomNumberKernel.cc)
+struct curandGenerator_st;
+
+// Forward definition from hiprand.h (the full header is only needed in HiprandRandomNumberKernel.cc)
+struct rocrand_generator_base_type;
+typedef rocrand_generator_base_type hiprandGenerator_st;
+
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
@@ -107,7 +108,6 @@ namespace mg5amcCpu

  //--------------------------------------------------------------------------

-#ifndef MGONGPU_HAS_NO_CURAND
  // A class encapsulating CURAND random number generation on a CPU host or on a GPU device
  class CurandRandomNumberKernel final : public RandomNumberKernelBase
  {
@@ -142,11 +142,49 @@ namespace mg5amcCpu
    const bool m_isOnDevice;

    // The curand generator
-    // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st')
+    // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st')
    curandGenerator_st* m_rnGen;
  };
-#endif

+  //--------------------------------------------------------------------------
+
+  // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device
+  class HiprandRandomNumberKernel final : public RandomNumberKernelBase
+  {
+  public:
+
+    // Constructor from an existing output buffer
+    HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice );
+
+    // Destructor
+    ~HiprandRandomNumberKernel();
+
+    // Seed the random number generator
+    void seedGenerator( const unsigned int seed ) override final;
+
+    // Generate the random number array
+    void generateRnarray() override final;
+
+    // Is this a host or device kernel?
+    bool isOnDevice() const override final { return m_isOnDevice; }
+
+  private:
+
+    // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void createGenerator();
+
+    // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor)
+    void destroyGenerator();
+
+  private:
+
+    // Is this a host or device kernel?
+    const bool m_isOnDevice;
+
+    // The hiprand generator
+    // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st')
+    hiprandGenerator_st* m_rnGen;
+  };

  //--------------------------------------------------------------------------
}
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk
index 1077bdc098..3ad91dfd59 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk
@@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
  comma:=,
  CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
  CUINC = -I$(CUDA_HOME)/include/
-  ifeq ($(RNDGEN),hasNoCurand)
-    CURANDLIBFLAGS=
-  else
-    CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
-  endif
  CUOPTFLAGS = -lineinfo
  ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
  GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math
@@ -241,7 +236,7 @@ else
  override GPUCC=
  override USE_NVTX=
  override CUINC=
-  override CURANDLIBFLAGS=
+  override HIPINC=
endif

@@ -291,7 +286,7 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD

# Set the default OMPFLAGS choice
ifneq ($(findstring hipcc,$(GPUCC)),)
@@ -352,29 +347,62 @@ ifeq ($(HRDCOD),)
  override HRDCOD = 0
endif

-# Set the default RNDGEN (random number generator) choice
-ifeq ($(RNDGEN),)
-  ifeq ($(GPUCC),)
-    override RNDGEN = hasNoCurand
-  # Edgecase for HIP compilation
-  else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-    override RNDGEN = hasNoCurand
-  else ifeq ($(RNDGEN),)
-    override RNDGEN = hasCurand
-  endif
-endif
-
-# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
+# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too
export AVX
export FPTYPE
export HELINL
export HRDCOD
-export RNDGEN
export OMPFLAGS

#-------------------------------------------------------------------------------

-#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND
+
+# If the legacy RNDGEN exists, it takes precedence over any HASCURAND choice (but a warning is printed out)
+###$(info RNDGEN=$(RNDGEN))
+ifneq ($(RNDGEN),)
+  $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!)
+  ifeq ($(RNDGEN),hasCurand)
+    override HASCURAND = $(RNDGEN)
+  else ifeq ($(RNDGEN),hasNoCurand)
+    override HASCURAND = $(RNDGEN)
+  else
+    $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!)
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand)
+  override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND
+else ifeq ($(HASCURAND),hasCurand)
+  override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
else
-  $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported)
+  $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported)
+endif
+
+# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand")
+ifeq ($(HASHIPRAND),hasNoHiprand)
+  override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND
+else ifeq ($(HASHIPRAND),hasHiprand)
+  override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand
+else
+  $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported)
endif

+#$(info RNDCXXFLAGS=$(RNDCXXFLAGS))
+#$(info HASHIPRAND=$(HASHIPRAND))
+
#-------------------------------------------------------------------------------

#=== Configure build directories and build lockfiles ===
@@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)

# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed)
# (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators)
-override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN)
+override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND)

# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1
ifeq ($(USEBUILDDIR),1)
@@ -589,14 +636,19 @@ endif
$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC)
$(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC)

-# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679)
-$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND)
-$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND)
-$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND)
-$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND)
-ifeq ($(RNDGEN),hasCurand)
+# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o
+$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS)
+$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS)
+ifeq ($(HASCURAND),hasCurand) # curand headers, #679
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif
+ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers
+$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC)
+endif

# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk index fb8da8830b..fbdc922c93 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including 

###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC has been exported from cudacpp.mk including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -86,7 +86,8 @@ endif

#-------------------------------------------------------------------------------

-#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN
+#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk)
+#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk)

# Set the build flags appropriate to OMPFLAGS
###$(info OMPFLAGS=$(OMPFLAGS))
@@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0)
  $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported)
endif

-# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand")
-###$(info RNDGEN=$(RNDGEN))
-ifeq ($(RNDGEN),hasNoCurand)
-  CXXFLAGS += -DMGONGPU_HAS_NO_CURAND
-else ifneq ($(RNDGEN),hasCurand)
-  $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported)
-endif
-
#-------------------------------------------------------------------------------

#=== Configure build directories and build lockfiles ===
@@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)

# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed)
# (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators)
-override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN)
+override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND)

# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1
###$(info Current directory is $(shell pwd))
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
index 475749ca7c..22d6921fda 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h
@@ -20,11 +20,15 @@
#undef MGONGPUCPP_GPUIMPL
#endif

+// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined
+// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc)
+#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand)
+
// ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

// Choose if curand is supported for generating random numbers
-// For HIP, by default, do not use curand (common random numbers will be used instead)
+// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead)
// For both CUDA and C++, by default, do not disable curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785)
#if defined __HIPCC__
@@ -39,6 +43,22 @@
//#endif
#endif

+// Choose if hiprand is supported for generating random numbers
+// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead)
+// For both HIP and C++, by default, do not disable hiprand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND
+// (there may exist HIP installations which do not include hiprand?)
+#if defined __CUDACC__
+#define MGONGPU_HAS_NO_HIPRAND 1
+#else
+//#ifdef __HIPCC__
+//#undef MGONGPU_HAS_NO_HIPRAND // default
+////#define MGONGPU_HAS_NO_HIPRAND 1
+//#else
+//#undef MGONGPU_HAS_NO_HIPRAND // default
+////#define MGONGPU_HAS_NO_HIPRAND 1
+//#endif
+#endif
+
// Choose floating point precision (for everything but color algebra #537)
// If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167)
#if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index b13bc090f3..4b983ad8d3 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
define j = p
INFO: load particles
INFO: load vertices
-DEBUG: model prefixing takes 0.005463123321533203 
+DEBUG: model prefixing takes 0.005466461181640625 
INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions 
DEBUG: remove interactions: u s w+ at order: QED=1 
@@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~
INFO: Process c~ c > t t~ added to mirror process c c~ > t t~
INFO: Process d~ d > t t~ added to mirror process d d~ > t t~
INFO: Process s~ s > t t~ added to mirror process s s~ > t t~
-5 processes with 7 diagrams generated in 0.030 s
+5 processes with 7 diagrams generated in 0.029 s
Total: 5 processes with 7 diagrams
add process p p > t t~ j @1
INFO: Checking for minimal orders which gives processes.
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g
INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~
INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g
-13 processes with 76 diagrams generated in 0.143 s
+13 processes with 76 diagrams generated in 0.135 s
Total: 18 processes with 83 diagrams
add process p p > t t~ j j @2
INFO: Checking for minimal orders which gives processes.
@@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~
INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~
INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams.
-65 processes with 1119 diagrams generated in 1.876 s +65 processes with 1119 diagrams generated in 1.826 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -387,10 +387,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  @@ -499,7 +499,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -516,7 +516,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -533,7 +533,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -550,7 +550,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -567,7 +567,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -584,7 +584,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -601,7 +601,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -618,7 +618,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -635,7 +635,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -652,7 +652,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -669,7 +669,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -686,7 +686,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -703,7 +703,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -720,7 +720,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -737,7 +737,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -754,7 +754,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -771,7 +771,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -788,7 +788,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -803,23 +803,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.315 s -Wrote files for 810 helas calls in 3.355 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.280 s +Wrote files for 810 helas calls in 3.230 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.343 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 5 routines in 0.333 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.321 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -1023,16 +1023,16 @@ Hunk #2 succeeded at 194 (offset 51 lines). Hunk #3 succeeded at 272 (offset 51 lines). Hunk #4 succeeded at 300 (offset 51 lines). Hunk #5 succeeded at 345 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 238]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m9.107s -user 0m8.555s -sys 0m0.517s +real 0m8.828s +user 0m8.276s +sys 0m0.507s Code generation completed in 9 seconds ************************************************************ * * diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..c160c5e06b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -10,6 +10,7 @@ #include #ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #include "curand.h" #define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2024) for the MG5aMC CUDACPP plugin.
+
+#include "mgOnGpuConfig.h"
+
+#include "GpuRuntime.h"
+#include "MemoryBuffers.h"
+#include "RandomNumberKernels.h"
+
+#include <cassert>
+
+#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */
+#ifndef __HIP_PLATFORM_AMD__
+#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand)
+#endif
+#include <hiprand/hiprand.h>
+#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); }
+inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true )
+{
+  if ( code != HIPRAND_STATUS_SUCCESS )
+  {
+    printf( "HiprandAssert: %s:%d code=%d\n", file, line, code );
+    if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS );
+  }
+}
+#endif /* clang-format on */
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  //--------------------------------------------------------------------------
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice )
+    : RandomNumberKernelBase( rnarray )
+    , m_isOnDevice( onDevice )
+  {
+    if( m_isOnDevice )
+    {
+#ifdef MGONGPUCPP_GPUIMPL
+      if( !m_rnarray.isOnDevice() )
+        throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" );
+#else
+      throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" );
+#endif
+    }
+    else
+    {
+      if( m_rnarray.isOnDevice() )
+        throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" );
+    }
+    createGenerator();
+  }
+
+  //--------------------------------------------------------------------------
+
+  HiprandRandomNumberKernel::~HiprandRandomNumberKernel()
+  {
+    destroyGenerator();
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed )
+  {
+    if( m_isOnDevice )
+    {
+      destroyGenerator(); // workaround for #429
+      createGenerator(); // workaround for #429
+    }
+    //printf( "seedGenerator: seed %d\n", seed );
+    checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) );
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::createGenerator()
+  {
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT;
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW;
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A;
+    const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937;
+    //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10;
+    if( m_isOnDevice )
+    {
+      checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) );
+    }
+    else
+    {
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000
+    }
+    // FIXME: hiprand ordering is not implemented yet
+    // See https://github.com/ROCm/hipRAND/issues/75
+    /*
+    //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) );
+    checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) );
+    //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) );
+    //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) );
+    */
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::destroyGenerator()
+  {
+    checkHiprand( hiprandDestroyGenerator( m_rnGen ) );
+  }
+
+  //--------------------------------------------------------------------------
+
+  void HiprandRandomNumberKernel::generateRnarray()
+  {
+#if defined MGONGPU_FPTYPE_DOUBLE
+    checkHiprand( hiprandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) );
+#elif defined MGONGPU_FPTYPE_FLOAT
+    checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) );
+#endif
+    /*
+    printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() );
+    fptype* data = m_rnarray.data();
+#ifdef MGONGPUCPP_GPUIMPL
+    if( m_rnarray.isOnDevice() )
+    {
+      data = new fptype[m_rnarray.size()]();
+      checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) );
+    }
+#endif
+    for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ )
+      printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] );
+#ifdef MGONGPUCPP_GPUIMPL
+    if( m_rnarray.isOnDevice() ) delete[] data;
+#endif
+    */
+  }
+
+  //--------------------------------------------------------------------------
+#endif
+}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc
@@ -1,10 +1,10 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -193,6 +207,26 @@ main( int argc, char** argv )
      throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
#else
      rndgen = RandomNumberMode::CurandHost;
+#endif
+    }
+    else if( arg == "--hirdev" )
+    {
+#ifndef __HIPCC__
+      throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" );
+#elif defined MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" );
+#else
+      rndgen = RandomNumberMode::HiprandDevice;
+#endif
+    }
+    else if( arg == "--hirhst" )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" );
+#else
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //rndgen = RandomNumberMode::HiprandHost;
#endif
    }
    else if( arg == "--common" )
@@ -265,6 +299,20 @@ main( int argc, char** argv )
#endif
  }

+  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
+  {
+#if not defined MGONGPU_HAS_NO_HIPRAND
+    // See https://github.com/ROCm/hipRAND/issues/76
+    //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl;
+    //rndgen = RandomNumberMode::HiprandHost;
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#else
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#endif
+  }
+
  constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
  constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout
@@ -415,7 +463,7 @@ main( int argc, char** argv )
  std::unique_ptr<double[]> wavetimes( new double[niter] );
  std::unique_ptr<double[]> wv3atimes( new double[niter] );

-  // --- 0c. Create curand or common generator
+  // --- 0c. Create curand, hiprand or common generator
  const std::string cgenKey = "0c GenCreat";
  timermap.start( cgenKey );
  // Allocate the appropriate RandomNumberKernel
@@ -433,7 +481,7 @@ main( int argc, char** argv )
      prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
#endif
  }
-  else
+  else if( rndgen == RandomNumberMode::CurandDevice )
  {
#ifdef MGONGPU_HAS_NO_CURAND
    throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
    const bool onDevice = true;
    prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
#else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
#endif
  }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? 
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc
[... same check_sa.cc changes as shown in full above ...]
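Note: the new HiprandDevice path allocated above (prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) )) boils down to the standard hiprand host-API call sequence: create a generator, seed it, generate into a device buffer, destroy it. The standalone sketch below illustrates that sequence only; it is not part of this patch, and the generator type HIPRAND_RNG_PSEUDO_DEFAULT, the buffer size and the seed are illustrative assumptions rather than the plugin's actual choices.

// hiprand_device_sketch.cc - compile with e.g.: hipcc hiprand_device_sketch.cc -lhiprand
#ifndef __HIP_PLATFORM_AMD__
#define __HIP_PLATFORM_AMD__ 1 // as in HiprandRandomNumberKernel.cc: enable the AMD (rocrand) backend
#endif
#include <hip/hip_runtime.h>
#include <hiprand/hiprand.h>
#include <cassert>
#include <cstdio>
#include <vector>

int main()
{
  const size_t nrnd = 16; // illustrative buffer size
  double* devRnd = nullptr;
  assert( hipMalloc( (void**)&devRnd, nrnd * sizeof( double ) ) == hipSuccess );
  hiprandGenerator_t gen;
  // Create a device-side generator (the RNG type here is an illustrative choice)
  assert( hiprandCreateGenerator( &gen, HIPRAND_RNG_PSEUDO_DEFAULT ) == HIPRAND_STATUS_SUCCESS );
  assert( hiprandSetPseudoRandomGeneratorSeed( gen, 20240101 ) == HIPRAND_STATUS_SUCCESS );
  // Fill the device buffer with uniform doubles
  assert( hiprandGenerateUniformDouble( gen, devRnd, nrnd ) == HIPRAND_STATUS_SUCCESS );
  // Copy back to the host just to inspect the output of the sketch
  std::vector<double> hstRnd( nrnd );
  assert( hipMemcpy( hstRnd.data(), devRnd, nrnd * sizeof( double ), hipMemcpyDeviceToHost ) == hipSuccess );
  printf( "first random number: %f\n", hstRnd[0] );
  assert( hiprandDestroyGenerator( gen ) == HIPRAND_STATUS_SUCCESS );
  assert( hipFree( devRnd ) == hipSuccess );
  return 0;
}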
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc
[... same check_sa.cc changes as shown in full above ...]
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
[... same check_sa.cc changes as shown in full above ...]
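Note: the nested preprocessor cascade that picks the default RandomNumberMode is hard to read inline in the hunk. The small standalone program below (not part of the patch) reproduces the cascade verbatim, so the decision table can be checked by compiling with different -D flags (e.g. -DMGONGPU_HAS_NO_CURAND, -DMGONGPU_HAS_NO_HIPRAND) under nvcc, hipcc or a plain C++ compiler.

// rndgen_default_sketch.cc - reproduces the default-generator selection from check_sa.cc
#include <iostream>

int main()
{
#if defined __CUDACC__
#ifndef MGONGPU_HAS_NO_CURAND
  const char* rndgen = "CurandDevice"; // NVidia GPU build with curand
#else
  const char* rndgen = "CommonRandom"; // NVidia GPU build without curand
#endif
#elif defined __HIPCC__
#ifndef MGONGPU_HAS_NO_HIPRAND
  const char* rndgen = "HiprandDevice"; // AMD GPU build with hiprand
#else
  const char* rndgen = "CommonRandom"; // AMD GPU build without hiprand
#endif
#else
#ifndef MGONGPU_HAS_NO_CURAND
  const char* rndgen = "CurandHost"; // CPU build with curand
#elif not defined MGONGPU_HAS_NO_HIPRAND
  const char* rndgen = "HiprandDevice"; // CPU build with hiprand only
#else
  const char* rndgen = "CommonRandom"; // CPU build with neither curand nor hiprand
#endif
#endif
  std::cout << "Default random generator: " << rndgen << std::endl;
  return 0;
}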
hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! 
CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -813,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -822,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? @@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
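Note: prnk is a std::unique_ptr to the abstract RandomNumberKernelBase, so the long if/else-if ladder above only differs in which concrete kernel it allocates; adding hiprand is just one more branch. A minimal sketch of the same ownership pattern follows; the class and function names here are hypothetical stand-ins, not the plugin's classes.

// kernel_dispatch_sketch.cc - polymorphic kernel allocation behind std::unique_ptr
#include <memory>
#include <stdexcept>
#include <string>

struct RandomKernelBase
{
  virtual ~RandomKernelBase() = default;
  virtual void generate() = 0; // fill a random-number buffer
};
struct CommonKernel : RandomKernelBase { void generate() override { /* host C++ RNG */ } };
struct HiprandKernel : RandomKernelBase { void generate() override { /* hiprand host API */ } };

// Select the concrete backend at run time, as the rndgen ladder does in check_sa.cc
std::unique_ptr<RandomKernelBase> makeKernel( const std::string& mode )
{
  if( mode == "common" ) return std::make_unique<CommonKernel>();
  if( mode == "hiprand" ) return std::make_unique<HiprandKernel>();
  throw std::logic_error( "INTERNAL ERROR! Unknown mode " + mode );
}

int main()
{
  auto prnk = makeKernel( "hiprand" ); // same reset/ownership idea as prnk in check_sa.cc
  prnk->generate();
  return 0;
}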
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc
[... same check_sa.cc changes as shown in full above ...]
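Note on --hirhst: hiprand currently lacks hiprandCreateGeneratorHost (see the ROCm/hipRAND issue 76 link in the hunks above), which is why HiprandHost throws instead of allocating a kernel. For comparison, the curand host API that --curhst relies on is sketched below; this is not part of the patch, and the CURAND_RNG_PSEUDO_MRG32K3A generator type, buffer size and seed are illustrative assumptions.

// curand_host_sketch.cc - compile with e.g.: nvcc curand_host_sketch.cc -lcurand
#include <curand.h>
#include <cassert>
#include <cstdio>
#include <vector>

int main()
{
  const size_t nrnd = 16; // illustrative buffer size
  std::vector<double> hstRnd( nrnd );
  curandGenerator_t gen;
  // Host-side generator: this is the entry point with no hiprand equivalent yet
  assert( curandCreateGeneratorHost( &gen, CURAND_RNG_PSEUDO_MRG32K3A ) == CURAND_STATUS_SUCCESS );
  assert( curandSetPseudoRandomGeneratorSeed( gen, 20240101 ) == CURAND_STATUS_SUCCESS );
  // Generate directly into host memory, no device buffer or copy needed
  assert( curandGenerateUniformDouble( gen, hstRnd.data(), nrnd ) == CURAND_STATUS_SUCCESS );
  printf( "first random number: %f\n", hstRnd[0] );
  assert( curandDestroyGenerator( gen ) == CURAND_STATUS_SUCCESS );
  return 0;
}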
has no hiprand
+#endif
+#else
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#elif not defined MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand
+#endif
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -152,7 +166,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
 #endif
   // Bridge emulation mode (NB Bridge implies RamboHost!)
   bool bridge = false;
@@ -193,6 +207,26 @@ main( int argc, char** argv )
       throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
 #else
       rndgen = RandomNumberMode::CurandHost;
+#endif
+    }
+    else if( arg == "--hirdev" )
+    {
+#ifndef __HIPCC__
+      throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" );
+#elif defined MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" );
+#else
+      rndgen = RandomNumberMode::HiprandDevice;
+#endif
+    }
+    else if( arg == "--hirhst" )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" );
+#else
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //rndgen = RandomNumberMode::HiprandHost;
 #endif
     }
     else if( arg == "--common" )
@@ -265,6 +299,20 @@ main( int argc, char** argv )
 #endif
   }
 
+  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
+  {
+#if not defined MGONGPU_HAS_NO_HIPRAND
+    // See https://github.com/ROCm/hipRAND/issues/76
+    //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl;
+    //rndgen = RandomNumberMode::HiprandHost;
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#else
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#endif
+  }
+
   constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
   constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout
 
@@ -415,7 +463,7 @@ main( int argc, char** argv )
   std::unique_ptr<double[]> wavetimes( new double[niter] );
   std::unique_ptr<double[]> wv3atimes( new double[niter] );
 
-  // --- 0c. Create curand or common generator
+  // --- 0c. Create curand, hiprand or common generator
   const std::string cgenKey = "0c GenCreat";
   timermap.start( cgenKey );
   // Allocate the appropriate RandomNumberKernel
@@ -433,7 +481,7 @@ main( int argc, char** argv )
     prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
 #endif
   }
-  else
+  else if( rndgen == RandomNumberMode::CurandDevice )
   {
 #ifdef MGONGPU_HAS_NO_CURAND
     throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
     const bool onDevice = true;
     prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
   }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)
 
   // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
   std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
   // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
   double genrtime = 0;
 
-  // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+  // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
   // [NB This should not be necessary using the host API: "Generation functions
   // can be called multiple times on the same generator to generate successive
   // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
     //std::cout << "Got random numbers" << std::endl;
 
 #ifdef MGONGPUCPP_GPUIMPL
-    if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+    if( rndgen != RandomNumberMode::CurandDevice &&
+        rndgen != RandomNumberMode::HiprandDevice &&
+        rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
       const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
   wrkflwtxt += "CXS:";
 #else
   wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
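The preprocessor ladder reconstructed above selects a per-build default generator along two axes: the compiler in use (nvcc, hipcc, or a plain C++ compiler) and whether the corresponding generator library was available at build time. As a reading aid, the same decision table can be written as a single constexpr function; this is a minimal sketch under the same macros, and the function name is illustrative, not part of the patch.

// Illustrative mirror of the default-generator ladder above (not part of the patch).
enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

constexpr RandomNumberMode defaultRandomNumberMode()
{
#if defined __CUDACC__ // NVidia GPU build
#ifndef MGONGPU_HAS_NO_CURAND
  return RandomNumberMode::CurandDevice; // curand available: generate on the device
#else
  return RandomNumberMode::CommonRandom; // no curand: fall back to the common C++ generator
#endif
#elif defined __HIPCC__ // AMD GPU build
#ifndef MGONGPU_HAS_NO_HIPRAND
  return RandomNumberMode::HiprandDevice; // hiprand available: generate on the device
#else
  return RandomNumberMode::CommonRandom; // no hiprand: fall back to the common C++ generator
#endif
#else // CPU build
#ifndef MGONGPU_HAS_NO_CURAND
  return RandomNumberMode::CurandHost; // curand available: generate on the host
#elif not defined MGONGPU_HAS_NO_HIPRAND
  return RandomNumberMode::HiprandDevice; // hiprand but no curand: the patch picks the device generator here
#else
  return RandomNumberMode::CommonRandom; // neither library available
#endif
#endif
}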
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc
@@ -1,10 +1,10 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================
 
 #include "mgOnGpuConfig.h"
 
@@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 )
 {
   std::cout << "Usage: " << argv0
-            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]"
+            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]"
             << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl;
   std::cout << std::endl;
   std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl;
@@ -131,17 +131,31 @@ main( int argc, char** argv )
   enum class RandomNumberMode
   {
     CommonRandom = 0,
-    CurandHost = 1,
-    CurandDevice = 2
+    CurandHost = -1,
+    CurandDevice = 1,
+    HiprandHost = -2,
+    HiprandDevice = 2
   };
-#ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
-#elif defined __HIPCC__
-#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
-#elif defined __CUDACC__
+#if defined __CUDACC__
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785)
+#endif
+#elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand
+#endif
+#else
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#elif not defined MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand
+#endif
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -152,7 +166,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
 #endif
   // Bridge emulation mode (NB Bridge implies RamboHost!)
   bool bridge = false;
@@ -193,6 +207,26 @@ main( int argc, char** argv )
       throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
 #else
       rndgen = RandomNumberMode::CurandHost;
+#endif
+    }
+    else if( arg == "--hirdev" )
+    {
+#ifndef __HIPCC__
+      throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" );
+#elif defined MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" );
+#else
+      rndgen = RandomNumberMode::HiprandDevice;
+#endif
+    }
+    else if( arg == "--hirhst" )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" );
+#else
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //rndgen = RandomNumberMode::HiprandHost;
 #endif
     }
     else if( arg == "--common" )
@@ -265,6 +299,20 @@ main( int argc, char** argv )
 #endif
   }
 
+  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
+  {
+#if not defined MGONGPU_HAS_NO_HIPRAND
+    // See https://github.com/ROCm/hipRAND/issues/76
+    //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl;
+    //rndgen = RandomNumberMode::HiprandHost;
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#else
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#endif
+  }
+
   constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
   constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout
 
@@ -415,7 +463,7 @@ main( int argc, char** argv )
   std::unique_ptr<double[]> wavetimes( new double[niter] );
   std::unique_ptr<double[]> wv3atimes( new double[niter] );
 
-  // --- 0c. Create curand or common generator
+  // --- 0c. Create curand, hiprand or common generator
   const std::string cgenKey = "0c GenCreat";
   timermap.start( cgenKey );
   // Allocate the appropriate RandomNumberKernel
@@ -433,7 +481,7 @@ main( int argc, char** argv )
     prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
 #endif
   }
-  else
+  else if( rndgen == RandomNumberMode::CurandDevice )
   {
 #ifdef MGONGPU_HAS_NO_CURAND
     throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
     const bool onDevice = true;
     prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
   }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)
 
   // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
   std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
   // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
   double genrtime = 0;
 
-  // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+  // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
   // [NB This should not be necessary using the host API: "Generation functions
   // can be called multiple times on the same generator to generate successive
   // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
     //std::cout << "Got random numbers" << std::endl;
 
 #ifdef MGONGPUCPP_GPUIMPL
-    if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+    if( rndgen != RandomNumberMode::CurandDevice &&
+        rndgen != RandomNumberMode::HiprandDevice &&
+        rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
       const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
   wrkflwtxt += "CXS:";
 #else
   wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc
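The check_sa.cc changes are applied verbatim to every generated P2_* subprocess copy below (the blob hashes bde384c69e..e086ae12d9 are identical throughout). One detail of the enum hunk is worth spelling out: the re-numbering is a sign convention, with host generators negative, device generators positive, and the common C++ generator at zero. A minimal standalone sketch; the two helper functions are hypothetical and do not appear in the patch.

#include <cassert>

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

// Hypothetical helpers exploiting the sign convention (not in the patch).
inline bool generatesOnDevice( RandomNumberMode m ) { return static_cast<int>( m ) > 0; }
inline bool generatesOnHost( RandomNumberMode m ) { return static_cast<int>( m ) < 0; }

int main()
{
  assert( generatesOnDevice( RandomNumberMode::CurandDevice ) && generatesOnDevice( RandomNumberMode::HiprandDevice ) );
  assert( generatesOnHost( RandomNumberMode::CurandHost ) && generatesOnHost( RandomNumberMode::HiprandHost ) );
  assert( !generatesOnDevice( RandomNumberMode::CommonRandom ) && !generatesOnHost( RandomNumberMode::CommonRandom ) );
  return 0;
}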
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc
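The '0c GenCreat' hunk shown earlier replaces the old two-way if/else with a five-way chain, one branch per RandomNumberMode, each branch guarded so that unsupported builds fail loudly with an internal error. Stripped of the preprocessor guards, the chain reduces to a factory. The sketch below assumes a build with both libraries enabled and the kernel classes from this patch in scope; the makeRandomNumberKernel name is purely illustrative, and the CommonRandomNumberKernel constructor is inferred from its curand analogue (not shown in these hunks).

#include <memory>
#include <stdexcept>
#include "MemoryBuffers.h"
#include "RandomNumberKernels.h"

using namespace mg5amcCpu; // host build; GPU builds use mg5amcGpu

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

std::unique_ptr<RandomNumberKernelBase>
makeRandomNumberKernel( RandomNumberMode rndgen, BufferRndNumMomenta& hstRndmom, BufferRndNumMomenta& devRndmom )
{
  switch( rndgen )
  {
    case RandomNumberMode::CommonRandom: return std::make_unique<CommonRandomNumberKernel>( hstRndmom ); // ctor assumed
    case RandomNumberMode::CurandHost: return std::make_unique<CurandRandomNumberKernel>( hstRndmom, /*onDevice=*/false );
    case RandomNumberMode::CurandDevice: return std::make_unique<CurandRandomNumberKernel>( devRndmom, /*onDevice=*/true );
    case RandomNumberMode::HiprandHost: return std::make_unique<HiprandRandomNumberKernel>( hstRndmom, /*onDevice=*/false );
    case RandomNumberMode::HiprandDevice: return std::make_unique<HiprandRandomNumberKernel>( devRndmom, /*onDevice=*/true );
  }
  throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // no path to this statement
}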
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc
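The RamboHost warning block in these hunks encodes a data-locality rule: host-side rambo sampling reads random numbers from a host buffer, so a device-resident hiprand stream cannot feed it directly. Because hiprandCreateGeneratorHost is not implemented yet (https://github.com/ROCm/hipRAND/issues/76), the fallback is the common generator rather than HiprandHost, whether or not hiprand was built in. Expressed as a pure function, the rule looks like this (a sketch; the function name and the RamboSamplingMode enumerator values are illustrative):

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };
enum class RamboSamplingMode { RamboHost, RamboDevice }; // enumerator values illustrative

// Sketch of the consistency rule enforced by the warning block (not in the patch).
RandomNumberMode adjustForRamboHost( RamboSamplingMode rmbsmp, RandomNumberMode rndgen )
{
  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
    return RandomNumberMode::CommonRandom; // HiprandHost would be the natural fallback once hipRAND issue 76 is fixed
  return rndgen;
}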
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc
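The widened '1c CpHTDrnd' condition in these hunks captures when a host-to-device copy of the random numbers is still needed: only if rambo sampling runs on the device while generation did not already write into device memory, i.e. the mode is neither CurandDevice nor HiprandDevice. Equivalently, as a small predicate (a sketch with illustrative names, mirroring the guard above):

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };
enum class RamboSamplingMode { RamboHost, RamboDevice }; // enumerator values illustrative

// Sketch equivalent to the "1c CpHTDrnd" guard (not in the patch).
bool needCopyRndHostToDevice( RandomNumberMode rndgen, RamboSamplingMode rmbsmp )
{
  return rndgen != RandomNumberMode::CurandDevice &&
         rndgen != RandomNumberMode::HiprandDevice &&
         rmbsmp == RamboSamplingMode::RamboDevice;
}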
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc
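The reporting hunks deliberately label the hiprand modes as ROCRAND HOST/DEVICE, since hiprand dispatches to rocRAND on AMD platforms, and tag the workflow string with HIRHST+/HIRDEV+. Gathering the mode-to-label mapping in one place (a sketch; the function name is hypothetical, and the CommonRandom label is assumed because that branch is not shown in these hunks):

#include <string>

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

// Hypothetical helper reproducing the rndgentxt labels above (not in the patch).
std::string rndgenLabel( RandomNumberMode m )
{
  switch( m )
  {
    case RandomNumberMode::CommonRandom: return "COMMON RANDOM"; // label assumed
    case RandomNumberMode::CurandHost: return "CURAND HOST";
    case RandomNumberMode::CurandDevice: return "CURAND DEVICE";
    case RandomNumberMode::HiprandHost: return "ROCRAND HOST"; // hiprand maps to rocRAND on AMD
    case RandomNumberMode::HiprandDevice: return "ROCRAND DEVICE";
  }
  return "??????"; // no path to this statement
}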
AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! 
CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif + } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -497,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. For pseudorandom generators, multiple calls to generation @@ -515,7 +585,9 @@ main( int argc, char** argv ) //std::cout << "Got random numbers" << std::endl; #ifdef MGONGPUCPP_GPUIMPL - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -761,6 +833,10 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ @@ -788,7 +864,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif /* clang-format on */ +#endif // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc
@@ -1,10 +1,10 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
@@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 )
 {
   std::cout << "Usage: " << argv0
-            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]"
+            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]"
             << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl;
   std::cout << std::endl;
   std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl;
@@ -131,17 +131,31 @@ main( int argc, char** argv )
   enum class RandomNumberMode
   {
     CommonRandom = 0,
-    CurandHost = 1,
-    CurandDevice = 2
+    CurandHost = -1,
+    CurandDevice = 1,
+    HiprandHost = -2,
+    HiprandDevice = 2
   };
-#ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
-#elif defined __HIPCC__
-#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
-#elif defined __CUDACC__
+#if defined __CUDACC__
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785)
+#endif
+#elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand
+#endif
+#else
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#elif not defined MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand
+#endif
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -152,7 +166,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
 #endif
   // Bridge emulation mode (NB Bridge implies RamboHost!)
   bool bridge = false;
@@ -193,6 +207,26 @@ main( int argc, char** argv )
       throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
 #else
       rndgen = RandomNumberMode::CurandHost;
+#endif
+    }
+    else if( arg == "--hirdev" )
+    {
+#ifndef __HIPCC__
+      throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" );
+#elif defined MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" );
+#else
+      rndgen = RandomNumberMode::HiprandDevice;
+#endif
+    }
+    else if( arg == "--hirhst" )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" );
+#else
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //rndgen = RandomNumberMode::HiprandHost;
 #endif
     }
     else if( arg == "--common" )
@@ -265,6 +299,20 @@ main( int argc, char** argv )
 #endif
   }

+  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
+  {
+#if not defined MGONGPU_HAS_NO_HIPRAND
+    // See https://github.com/ROCm/hipRAND/issues/76
+    //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl;
+    //rndgen = RandomNumberMode::HiprandHost;
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#else
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#endif
+  }
+
   constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
   constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout
@@ -415,7 +463,7 @@ main( int argc, char** argv )
   std::unique_ptr<double[]> wavetimes( new double[niter] );
   std::unique_ptr<double[]> wv3atimes( new double[niter] );

-  // --- 0c. Create curand or common generator
+  // --- 0c. Create curand, hiprand or common generator
   const std::string cgenKey = "0c GenCreat";
   timermap.start( cgenKey );
   // Allocate the appropriate RandomNumberKernel
@@ -433,7 +481,7 @@ main( int argc, char** argv )
     prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
 #endif
   }
-  else
+  else if( rndgen == RandomNumberMode::CurandDevice )
   {
 #ifdef MGONGPU_HAS_NO_CURAND
     throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
     const bool onDevice = true;
     prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" );   // INTERNAL ERROR (no path to this statement)
 #endif
   }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)

   // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
   std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
     // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
     double genrtime = 0;

-    // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+    // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
     // [NB This should not be necessary using the host API: "Generation functions
     // can be called multiple times on the same generator to generate successive
     // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
     //std::cout << "Got random numbers" << std::endl;

 #ifdef MGONGPUCPP_GPUIMPL
-    if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+    if( rndgen != RandomNumberMode::CurandDevice &&
+        rndgen != RandomNumberMode::HiprandDevice &&
+        rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
       const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
     wrkflwtxt += "FLT+";
 #else
     wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
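A side note on the re-numbered RandomNumberMode enum above: the new values encode the host/device split in the sign (host generators negative, device generators positive, CommonRandom zero). A small self-contained sketch of that convention; the helper names are hypothetical, not part of the patch:

#include <cassert>

enum class RandomNumberMode // same values as in the patched check_sa.cc
{
  CommonRandom = 0,
  CurandHost = -1,
  CurandDevice = 1,
  HiprandHost = -2,
  HiprandDevice = 2
};

// Hypothetical helpers: recover the host/device split from the sign
inline bool isDeviceMode( RandomNumberMode m ) { return static_cast<int>( m ) > 0; }
inline bool isHostMode( RandomNumberMode m ) { return static_cast<int>( m ) < 0; }

int main()
{
  assert( isDeviceMode( RandomNumberMode::CurandDevice ) && isDeviceMode( RandomNumberMode::HiprandDevice ) );
  assert( isHostMode( RandomNumberMode::CurandHost ) && isHostMode( RandomNumberMode::HiprandHost ) );
  assert( !isDeviceMode( RandomNumberMode::CommonRandom ) && !isHostMode( RandomNumberMode::CommonRandom ) );
  return 0;
}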
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc
@@ -1,10 +1,10 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
@@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 )
 {
   std::cout << "Usage: " << argv0
-            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]"
+            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]"
             << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl;
   std::cout << std::endl;
   std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl;
@@ -131,17 +131,31 @@ main( int argc, char** argv )
   enum class RandomNumberMode
   {
     CommonRandom = 0,
-    CurandHost = 1,
-    CurandDevice = 2
+    CurandHost = -1,
+    CurandDevice = 1,
+    HiprandHost = -2,
+    HiprandDevice = 2
   };
-#ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
-#elif defined __HIPCC__
-#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
-#elif defined __CUDACC__
+#if defined __CUDACC__
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785)
+#endif
+#elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand
+#endif
+#else
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#elif not defined MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand
+#endif
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -152,7 +166,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
 #endif
   // Bridge emulation mode (NB Bridge implies RamboHost!)
   bool bridge = false;
@@ -193,6 +207,26 @@ main( int argc, char** argv )
       throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
 #else
      rndgen = RandomNumberMode::CurandHost;
+#endif
+    }
+    else if( arg == "--hirdev" )
+    {
+#ifndef __HIPCC__
+      throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" );
+#elif defined MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" );
+#else
+      rndgen = RandomNumberMode::HiprandDevice;
+#endif
+    }
+    else if( arg == "--hirhst" )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" );
+#else
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //rndgen = RandomNumberMode::HiprandHost;
 #endif
     }
     else if( arg == "--common" )
@@ -265,6 +299,20 @@ main( int argc, char** argv )
 #endif
   }

+  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
+  {
+#if not defined MGONGPU_HAS_NO_HIPRAND
+    // See https://github.com/ROCm/hipRAND/issues/76
+    //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl;
+    //rndgen = RandomNumberMode::HiprandHost;
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#else
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#endif
+  }
+
   constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
   constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout
@@ -415,7 +463,7 @@ main( int argc, char** argv )
   std::unique_ptr<double[]> wavetimes( new double[niter] );
   std::unique_ptr<double[]> wv3atimes( new double[niter] );

-  // --- 0c. Create curand or common generator
+  // --- 0c. Create curand, hiprand or common generator
   const std::string cgenKey = "0c GenCreat";
   timermap.start( cgenKey );
   // Allocate the appropriate RandomNumberKernel
@@ -433,7 +481,7 @@ main( int argc, char** argv )
     prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
 #endif
   }
-  else
+  else if( rndgen == RandomNumberMode::CurandDevice )
   {
 #ifdef MGONGPU_HAS_NO_CURAND
     throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
     const bool onDevice = true;
     prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" );   // INTERNAL ERROR (no path to this statement)
 #endif
   }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)

   // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
   std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
     // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
     double genrtime = 0;

-    // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+    // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
     // [NB This should not be necessary using the host API: "Generation functions
     // can be called multiple times on the same generator to generate successive
     // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
     //std::cout << "Got random numbers" << std::endl;

 #ifdef MGONGPUCPP_GPUIMPL
-    if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+    if( rndgen != RandomNumberMode::CurandDevice &&
+        rndgen != RandomNumberMode::HiprandDevice &&
+        rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
       const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
     wrkflwtxt += "FLT+";
 #else
     wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc
@@ -1,10 +1,10 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
@@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 )
 {
   std::cout << "Usage: " << argv0
-            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]"
+            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]"
             << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl;
   std::cout << std::endl;
   std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl;
@@ -131,17 +131,31 @@ main( int argc, char** argv )
   enum class RandomNumberMode
   {
     CommonRandom = 0,
-    CurandHost = 1,
-    CurandDevice = 2
+    CurandHost = -1,
+    CurandDevice = 1,
+    HiprandHost = -2,
+    HiprandDevice = 2
   };
-#ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
-#elif defined __HIPCC__
-#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
-#elif defined __CUDACC__
+#if defined __CUDACC__
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785)
+#endif
+#elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand
+#endif
+#else
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#elif not defined MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand
+#endif
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -152,7 +166,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
 #endif
   // Bridge emulation mode (NB Bridge implies RamboHost!)
   bool bridge = false;
@@ -193,6 +207,26 @@ main( int argc, char** argv )
       throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
 #else
      rndgen = RandomNumberMode::CurandHost;
+#endif
+    }
+    else if( arg == "--hirdev" )
+    {
+#ifndef __HIPCC__
+      throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" );
+#elif defined MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" );
+#else
+      rndgen = RandomNumberMode::HiprandDevice;
+#endif
+    }
+    else if( arg == "--hirhst" )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" );
+#else
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //rndgen = RandomNumberMode::HiprandHost;
 #endif
     }
     else if( arg == "--common" )
@@ -265,6 +299,20 @@ main( int argc, char** argv )
 #endif
   }

+  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
+  {
+#if not defined MGONGPU_HAS_NO_HIPRAND
+    // See https://github.com/ROCm/hipRAND/issues/76
+    //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl;
+    //rndgen = RandomNumberMode::HiprandHost;
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#else
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#endif
+  }
+
   constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
   constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout
@@ -415,7 +463,7 @@ main( int argc, char** argv )
   std::unique_ptr<double[]> wavetimes( new double[niter] );
   std::unique_ptr<double[]> wv3atimes( new double[niter] );

-  // --- 0c. Create curand or common generator
+  // --- 0c. Create curand, hiprand or common generator
   const std::string cgenKey = "0c GenCreat";
   timermap.start( cgenKey );
   // Allocate the appropriate RandomNumberKernel
@@ -433,7 +481,7 @@ main( int argc, char** argv )
     prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
 #endif
   }
-  else
+  else if( rndgen == RandomNumberMode::CurandDevice )
   {
 #ifdef MGONGPU_HAS_NO_CURAND
     throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
     const bool onDevice = true;
     prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" );   // INTERNAL ERROR (no path to this statement)
 #endif
   }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)

   // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
   std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
     // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
     double genrtime = 0;

-    // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+    // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
     // [NB This should not be necessary using the host API: "Generation functions
     // can be called multiple times on the same generator to generate successive
     // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
     //std::cout << "Got random numbers" << std::endl;

 #ifdef MGONGPUCPP_GPUIMPL
-    if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+    if( rndgen != RandomNumberMode::CurandDevice &&
+        rndgen != RandomNumberMode::HiprandDevice &&
+        rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
       const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
     wrkflwtxt += "FLT+";
 #else
     wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
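The preprocessor ladder above picks the default generator from the compiler (__CUDACC__ or __HIPCC__) and the build flags (MGONGPU_HAS_NO_CURAND, MGONGPU_HAS_NO_HIPRAND). The same precedence can be stated as an ordinary function; the following is a sketch for illustration, not code from the patch:

#include <cassert>

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

// Restates the #if/#elif ladder: CUDA builds prefer CurandDevice, HIP builds
// prefer HiprandDevice, CPU builds prefer CurandHost, then HiprandDevice,
// and CommonRandom is the fallback when neither library is available.
RandomNumberMode defaultMode( bool isCuda, bool isHip, bool hasCurand, bool hasHiprand )
{
  if( isCuda ) return hasCurand ? RandomNumberMode::CurandDevice : RandomNumberMode::CommonRandom;
  if( isHip ) return hasHiprand ? RandomNumberMode::HiprandDevice : RandomNumberMode::CommonRandom;
  if( hasCurand ) return RandomNumberMode::CurandHost;
  if( hasHiprand ) return RandomNumberMode::HiprandDevice; // later downgraded by the RamboHost check
  return RandomNumberMode::CommonRandom;
}

int main()
{
  assert( defaultMode( true, false, true, false ) == RandomNumberMode::CurandDevice );
  assert( defaultMode( false, true, false, true ) == RandomNumberMode::HiprandDevice );
  assert( defaultMode( false, false, false, false ) == RandomNumberMode::CommonRandom );
  return 0;
}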
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc
@@ -1,10 +1,10 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
@@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 )
 {
   std::cout << "Usage: " << argv0
-            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]"
+            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]"
             << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl;
   std::cout << std::endl;
   std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl;
@@ -131,17 +131,31 @@ main( int argc, char** argv )
   enum class RandomNumberMode
   {
     CommonRandom = 0,
-    CurandHost = 1,
-    CurandDevice = 2
+    CurandHost = -1,
+    CurandDevice = 1,
+    HiprandHost = -2,
+    HiprandDevice = 2
   };
-#ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
-#elif defined __HIPCC__
-#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
-#elif defined __CUDACC__
+#if defined __CUDACC__
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785)
+#endif
+#elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand
+#endif
+#else
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#elif not defined MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand
+#endif
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -152,7 +166,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
 #endif
   // Bridge emulation mode (NB Bridge implies RamboHost!)
   bool bridge = false;
@@ -193,6 +207,26 @@ main( int argc, char** argv )
       throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" );
 #else
      rndgen = RandomNumberMode::CurandHost;
+#endif
+    }
+    else if( arg == "--hirdev" )
+    {
+#ifndef __HIPCC__
+      throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" );
+#elif defined MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" );
+#else
+      rndgen = RandomNumberMode::HiprandDevice;
+#endif
+    }
+    else if( arg == "--hirhst" )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" );
+#else
+      // See https://github.com/ROCm/hipRAND/issues/76
+      throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" );
+      //rndgen = RandomNumberMode::HiprandHost;
 #endif
     }
     else if( arg == "--common" )
@@ -265,6 +299,20 @@ main( int argc, char** argv )
 #endif
   }

+  if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice )
+  {
+#if not defined MGONGPU_HAS_NO_HIPRAND
+    // See https://github.com/ROCm/hipRAND/issues/76
+    //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl;
+    //rndgen = RandomNumberMode::HiprandHost;
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#else
+    std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl;
+    rndgen = RandomNumberMode::CommonRandom;
+#endif
+  }
+
   constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout
   constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout
@@ -415,7 +463,7 @@ main( int argc, char** argv )
   std::unique_ptr<double[]> wavetimes( new double[niter] );
   std::unique_ptr<double[]> wv3atimes( new double[niter] );

-  // --- 0c. Create curand or common generator
+  // --- 0c. Create curand, hiprand or common generator
   const std::string cgenKey = "0c GenCreat";
   timermap.start( cgenKey );
   // Allocate the appropriate RandomNumberKernel
@@ -433,7 +481,7 @@ main( int argc, char** argv )
     prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) );
 #endif
   }
-  else
+  else if( rndgen == RandomNumberMode::CurandDevice )
   {
 #ifdef MGONGPU_HAS_NO_CURAND
     throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement)
@@ -441,9 +489,31 @@ main( int argc, char** argv )
     const bool onDevice = true;
     prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) );
 #else
-    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
+    throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" );   // INTERNAL ERROR (no path to this statement)
 #endif
   }
+  else if( rndgen == RandomNumberMode::HiprandHost )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+    const bool onDevice = false;
+    prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+  }
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+  {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+    throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+    const bool onDevice = true;
+    prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+    throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+  }
+  else
+    throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)

   // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
   std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
     // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
     double genrtime = 0;

-    // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+    // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
     // [NB This should not be necessary using the host API: "Generation functions
     // can be called multiple times on the same generator to generate successive
     // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
     //std::cout << "Got random numbers" << std::endl;

 #ifdef MGONGPUCPP_GPUIMPL
-    if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+    if( rndgen != RandomNumberMode::CurandDevice &&
+        rndgen != RandomNumberMode::HiprandDevice &&
+        rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
       const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
     wrkflwtxt += "FLT+";
 #else
     wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
@@ -1099,7 +1179,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
             //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl
 #endif
-            << "\"Curand generation\": "
+            << "\"Random generation\": "
             << "\"" << rndgentxt << "\"," << std::endl;
   double minelem = hstStats.minME;
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/HiprandRandomNumberKernel.cc
new file mode 120000
index 0000000000..6691864f78
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/HiprandRandomNumberKernel.cc
@@ -0,0 +1 @@
+../HiprandRandomNumberKernel.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc
index bde384c69e..e086ae12d9 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc
@@ -1,10 +1,10 @@
 // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors.
 // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend.
 //==========================================================================
-// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Copyright (C) 2020-2024 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
@@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 )
 {
   std::cout << "Usage: " << argv0
-            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]"
+            << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]"
             << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl;
   std::cout << std::endl;
   std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl;
@@ -131,17 +131,31 @@ main( int argc, char** argv )
   enum class RandomNumberMode
   {
     CommonRandom = 0,
-    CurandHost = 1,
-    CurandDevice = 2
+    CurandHost = -1,
+    CurandDevice = 1,
+    HiprandHost = -2,
+    HiprandDevice = 2
   };
-#ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
-#elif defined __HIPCC__
-#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
-#elif defined __CUDACC__
+#if defined __CUDACC__
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785)
+#endif
+#elif defined __HIPCC__
+#ifndef MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand
+#endif
+#else
+#ifndef MGONGPU_HAS_NO_CURAND
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
+#elif not defined MGONGPU_HAS_NO_HIPRAND
+  RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand
+#else
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand
+#endif
 #endif
   // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!)
   enum class RamboSamplingMode
@@ -152,7 +166,7 @@ main( int argc, char** argv )
 #ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
 #endif
   // Bridge emulation mode (NB Bridge implies RamboHost!)
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+      const bool onDevice = false;
+      prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+    }
+    else if( rndgen == RandomNumberMode::HiprandDevice )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+      const bool onDevice = true;
+      prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+      throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+    }
+    else
+      throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)
   // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
   std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
   // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
   double genrtime = 0;
-  // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+  // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
   // [NB This should not be necessary using the host API: "Generation functions
   // can be called multiple times on the same generator to generate successive
   // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
   //std::cout << "Got random numbers" << std::endl;
 #ifdef MGONGPUCPP_GPUIMPL
-  if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+  if( rndgen != RandomNumberMode::CurandDevice &&
+      rndgen != RandomNumberMode::HiprandDevice &&
+      rmbsmp == RamboSamplingMode::RamboDevice )
   {
     // --- 1c. Copy rndmom from host to device
     const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
   wrkflwtxt += "CXS:";
 #else
   wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
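NB: the 0c "GenCreat" hunks above all follow one dispatch pattern: pick a RandomNumberMode at runtime, construct the matching kernel behind the common RandomNumberKernelBase interface, and turn every unreachable combination into an exception. The stand-alone C++ sketch below mirrors that shape only; CommonStandIn and makeKernel are illustrative names, not plugin code (the real check_sa.cc instantiates CurandRandomNumberKernel or HiprandRandomNumberKernel here, guarded by MGONGPU_HAS_NO_CURAND, MGONGPU_HAS_NO_HIPRAND and __HIPCC__), and std::mt19937_64 stands in for the common random numbers.

// Illustrative sketch (not plugin code): same dispatch shape as the 0c "GenCreat" step.
#include <memory>
#include <random>
#include <stdexcept>
#include <vector>

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

struct RandomNumberKernelBase
{
  virtual ~RandomNumberKernelBase() = default;
  virtual void seedGenerator( unsigned int seed ) = 0;
  virtual void generateRnarray( std::vector<double>& out ) = 0;
};

// Host-side stand-in for the common random numbers: flat doubles in [0,1).
struct CommonStandIn final : public RandomNumberKernelBase
{
  void seedGenerator( unsigned int seed ) override { m_rng.seed( seed ); }
  void generateRnarray( std::vector<double>& out ) override
  {
    std::uniform_real_distribution<double> flat( 0., 1. );
    for( double& r : out ) r = flat( m_rng );
  }
private:
  std::mt19937_64 m_rng;
};

std::unique_ptr<RandomNumberKernelBase> makeKernel( RandomNumberMode mode )
{
  if( mode == RandomNumberMode::CommonRandom )
    return std::make_unique<CommonStandIn>();
  // The real code constructs CurandRandomNumberKernel or HiprandRandomNumberKernel
  // in the other branches; builds without the corresponding library throw instead,
  // exactly as in the hunks above.
  throw std::logic_error( "mode not supported in this sketch" );
}

Seeding host and device generators with the same value is what the 1a hunk above relies on to get identical curand/hiprand results on host and device.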
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/HiprandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc index bde384c69e..e086ae12d9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc @@ -1,10 +1,10 @@ // Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. // Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. //========================================================================== -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" @@ -58,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -131,17 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random -#elif defined __CUDACC__ +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -152,7 +166,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -193,6 +207,26 @@ main( int argc, char** argv ) throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); #else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); +#else + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -265,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -415,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -433,7 +481,7 @@ main( int argc, char** argv ) prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); #endif } - else + else if( rndgen == RandomNumberMode::CurandDevice ) { #ifdef MGONGPU_HAS_NO_CURAND throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) @@ -441,9 +489,31 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } + else if( rndgen == RandomNumberMode::HiprandHost ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! 
HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#else
+      const bool onDevice = false;
+      prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) );
+#endif
+    }
+    else if( rndgen == RandomNumberMode::HiprandDevice )
+    {
+#ifdef MGONGPU_HAS_NO_HIPRAND
+      throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement)
+#elif defined __HIPCC__
+      const bool onDevice = true;
+      prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) );
+#else
+      throw std::logic_error( "INTERNAL ERROR! HiprandDevice is not supported on CPUs or non-AMD GPUs" ); // INTERNAL ERROR (no path to this statement)
+#endif
+    }
+    else
+      throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement)
   // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment]
   std::unique_ptr<SamplingKernelBase> prsk;
@@ -497,7 +567,7 @@ main( int argc, char** argv )
   // *** START THE OLD-STYLE TIMER FOR RANDOM GEN ***
   double genrtime = 0;
-  // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+  // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand)
   // [NB This should not be necessary using the host API: "Generation functions
   // can be called multiple times on the same generator to generate successive
   // blocks of results. For pseudorandom generators, multiple calls to generation
@@ -515,7 +585,9 @@ main( int argc, char** argv )
   //std::cout << "Got random numbers" << std::endl;
 #ifdef MGONGPUCPP_GPUIMPL
-  if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
+  if( rndgen != RandomNumberMode::CurandDevice &&
+      rndgen != RandomNumberMode::HiprandDevice &&
+      rmbsmp == RamboSamplingMode::RamboDevice )
   {
     // --- 1c. Copy rndmom from host to device
     const std::string htodKey = "1c CpHTDrnd";
@@ -761,6 +833,10 @@ main( int argc, char** argv )
     rndgentxt = "CURAND HOST";
   else if( rndgen == RandomNumberMode::CurandDevice )
     rndgentxt = "CURAND DEVICE";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    rndgentxt = "ROCRAND HOST";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    rndgentxt = "ROCRAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
 #elif defined __HIPCC__
@@ -788,7 +864,7 @@ main( int argc, char** argv )
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif /* clang-format on */
+#endif
   // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -813,7 +889,7 @@ main( int argc, char** argv )
   wrkflwtxt += "CXS:";
 #else
   wrkflwtxt += "???:"; // no path to this statement
-#endif
+#endif /* clang-format on */
 #endif
   // -- COMMON or CURAND HOST or CURAND DEVICE random numbers?
   if( rndgen == RandomNumberMode::CommonRandom )
@@ -822,6 +898,10 @@ main( int argc, char** argv )
     wrkflwtxt += "CURHST+";
   else if( rndgen == RandomNumberMode::CurandDevice )
     wrkflwtxt += "CURDEV+";
+  else if( rndgen == RandomNumberMode::HiprandHost )
+    wrkflwtxt += "HIRHST+";
+  else if( rndgen == RandomNumberMode::HiprandDevice )
+    wrkflwtxt += "HIRDEV+";
   else
     wrkflwtxt += "??????+"; // no path to this statement
   // -- HOST or DEVICE rambo sampling?
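NB: the wrkflwtxt tokens added above (HIRHST+ and HIRDEV+) are the same tokens that appear in the "Workflow summary" lines of the reference logs further below, e.g. CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK. A stand-alone sketch of the random-number component of that tag follows; rndgenToken is an illustrative helper, not part of check_sa.cc:

// Illustrative only: maps the runtime mode to the token appended to wrkflwtxt.
#include <string>

enum class RandomNumberMode { CommonRandom = 0, CurandHost = -1, CurandDevice = 1, HiprandHost = -2, HiprandDevice = 2 };

std::string rndgenToken( RandomNumberMode m )
{
  switch( m )
  {
    case RandomNumberMode::CommonRandom: return "COMMON+";
    case RandomNumberMode::CurandHost: return "CURHST+";
    case RandomNumberMode::CurandDevice: return "CURDEV+";
    case RandomNumberMode::HiprandHost: return "HIRHST+";
    case RandomNumberMode::HiprandDevice: return "HIRDEV+";
  }
  return "??????+"; // mirrors the code's own "no path to this statement" fallback
}

Note that HiprandHost maps to the ROCRAND HOST label in rndgentxt but to HIRHST+ in the workflow tag; the two strings serve different outputs (the human-readable banner versus the compact workflow summary).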
@@ -1099,7 +1179,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..7ed728a26c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h @@ -1,21 +1,22 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. +// Copyright (C) 2020-2024 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined -#ifndef MGONGPU_HAS_NO_CURAND -//#include "curand.h" -struct curandGenerator_st; // forward definition from curand.h -#endif - #include "MemoryBuffers.h" +// Forward definition from curand.h (the full header is only needed in CurandRandomKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else @@ -107,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -142,11 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - // (NB: curand.h defines typedef generator_t as a pointer to forward-defined 'struct curandGenerator_st') + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? 
+ const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 1077bdc098..3ad91dfd59 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -173,11 +173,6 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ - ifeq ($(RNDGEN),hasNoCurand) - CURANDLIBFLAGS= - else - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - endif CUOPTFLAGS = -lineinfo ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math @@ -241,7 +236,7 @@ else override GPUCC= override USE_NVTX= override CUINC= - override CURANDLIBFLAGS= + override HIPINC= endif @@ -291,7 +286,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice ifneq ($(findstring hipcc,$(GPUCC)),) @@ -352,29 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this take precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else ifneq ($(RNDGEN),hasNoCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) 
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -432,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -447,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -456,21 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq 
($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -481,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -589,14 +636,19 @@ endif $(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) $(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -# Apply special build flags only to check_sa[_cu].o and CurandRandomNumberKernel[_cu].o (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) +# Apply special build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) +endif # Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... 
instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) @@ -673,8 +725,8 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -684,8 +736,8 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o - $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(CURANDLIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index de327f2321..7f248d29a4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -238,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -260,4 +260,4 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk index b2b9da5288..d7d51259d2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk 
including ccache) ###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -86,7 +86,8 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) @@ -175,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -193,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 6bde4466d0..33306bc3e2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -20,11 +20,15 @@ #undef MGONGPUCPP_GPUIMPL #endif +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND // (there exist CUDA installations, e.g. 
using the HPC package, which do not include curand - see PR #784 and #785) #if defined __HIPCC__ @@ -39,6 +43,22 @@ //#endif #endif +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) +#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif +#endif + // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) #if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 903b6ba92d..b608406eb3 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-02-02_17:29:35 +DATE: 2024-02-08_19:35:27 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.6788s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6704s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5391s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s + [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1745s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1662s - [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1350s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1291s + [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3733s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s - [COUNTERS] Fortran MEs ( 1 ) : 0.0912s for 90112 events => throughput is 9.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2765s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2141s + [COUNTERS] Fortran MEs ( 1 ) : 0.0624s for 90112 events => throughput is 1.45E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,13 +134,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1813s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1740s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1583s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000780E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3645s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2877s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0768s for 90112 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2903s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2247s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0655s for 90112 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000780E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.136025e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.419157e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.134340e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.436418e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1705s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 2.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1400s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1365s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661518E-002) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3325s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2870s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0455s for 90112 events => throughput is 1.98E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2210s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0387s for 90112 events => throughput is 2.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.944838e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.381265e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.013875e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.442923e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1735s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1703s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1388s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1363s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.29E+06 events/s

*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16)

*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3185s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2848s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0337s for 90112 events => throughput is 2.67E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2483s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2211s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 90112 events => throughput is 3.31E+06 events/s

*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -332,166 +332,18 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.584573e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.446000e+06 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.738752e+06 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1713s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1685s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.91E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.550050e+06 ) sec^-1

-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***

-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3165s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2844s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0322s for 90112 events => throughput is 2.80E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.823433e+06 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.999514e+06 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715404661462E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1752s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1718s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.45E+06 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661462E-002) differ by less than 3E-14 (7.771561172376096e-16)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3254s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2890s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0364s for 90112 events => throughput is 2.47E+06 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.318962e+06 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.390989e+06 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
--------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
+ [NGOODHEL] ngoodhel/ncomb = 16/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.5889s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5884s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.57E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4196s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4192s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 1.99E+07 events/s

*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16)

*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -538,18 +390,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
+ [NGOODHEL] ngoodhel/ncomb = 16/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.7091s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7042s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.83E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5013s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4970s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 90112 events => throughput is 2.08E+07 events/s

*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -560,43 +412,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002
OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.120334e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.190744e+07 ) sec^-1

*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.962861e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.588028e+07 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.703996e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.251687e+07 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.442262e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.876460e+07 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.738155e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.274146e+07 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.997755e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.960127e+07 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.714770e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.226416e+07 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.130577e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.552390e+07 ) sec^-1

TEST COMPLETED

diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index 758878788d..fe2c61101c 100644
--- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum

CUDACPP_BUILDDIR='.'
-
make USEBUILDDIR=1 AVX=none
-make USEBUILDDIR=1 AVX=sse4
+make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 AVX=avx2
+
make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'

OMP_NUM_THREADS=

-DATE: 2024-02-02_17:29:51
+DATE: 2024-02-08_19:35:47

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum

*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 3798 events (found 8192 events)
- [COUNTERS] PROGRAM TOTAL : 0.6711s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6627s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.81E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5107s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5046s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.35E+06 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1805s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1717s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.32E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1360s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1301s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3725s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2819s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0906s for 90112 events => throughput is 9.94E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2782s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2158s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0623s for 90112 events => throughput is 1.45E+06 events/s

*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382700437610044E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09338 [9.3382701684199335E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1778s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1713s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 8192 events => throughput is 1.27E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1459s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1407s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.59E+06 events/s

*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700437610044E-002) differ by less than 4E-4 (1.6027646465577305e-07)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382701684199335E-002) differ by less than 4E-4 (1.4692721372888684e-07)

*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515587669165246E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09152 [9.1515588842633111E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3615s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2883s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0731s for 90112 events => throughput is 1.23E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2790s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2225s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0566s for 90112 events => throughput is 1.59E+06 events/s

*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587669165246E-002) differ by less than 4E-4 (1.568129937012941e-07)
+OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515588842633111E-002) differ by less than 4E-4 (1.439903947186849e-07)

*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.215528e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.663270e+06 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.215728e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.674806e+06 ) sec^-1

*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09338 [9.3382719831741665E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1704s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1678s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.12E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1377s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1356s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.83E+06 events/s

*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700723828302E-002) differ by less than 4E-4 (1.5721146218172777e-07)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719831741665E-002) differ by less than 4E-4 (4.740791825774693e-08)

*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09152 [9.1515606481761602E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3127s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2844s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0283s for 90112 events => throughput is 3.18E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2429s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2195s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s

*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002) differ by less than 4E-4 (1.5742791048545257e-07)
+OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606481761602E-002) differ by less than 4E-4 (4.875410031246474e-08)

*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.154731e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.044701e+06 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.263737e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.159562e+06 ) sec^-1

*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09338 [9.3382719700521907E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1711s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1686s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.38E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1369s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1351s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.59E+06 events/s

*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719700521907E-002) differ by less than 4E-4 (4.6002735842876064e-08)

*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09152 [9.1515606480805645E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3091s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2838s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0252s for 90112 events => throughput is 3.57E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2430s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2234s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0196s for 90112 events => throughput is 4.60E+06 events/s

*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07)
+OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606480805645E-002) differ by less than 4E-4 (4.874365444607065e-08)

*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.630979e+06 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.779764e+06 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1700s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1678s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.81E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3088s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.837087e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.885202e+06 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.068356e+06 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382704356154977E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1710s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1688s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.79E+06 events/s
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.038589e+06 ) sec^-1

-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***

-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382704356154977E-002) differ by less than 4E-4 (1.1831425661412709e-07)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515591292297929E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3132s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2882s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 90112 events => throughput is 3.61E+06 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591292297929E-002) differ by less than 4E-4 (1.172226659074127e-07)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.512697e+06 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.888864e+06 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
--------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
+ [NGOODHEL] ngoodhel/ncomb = 16/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09338 [9.3382704338101225E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.5860s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5855s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.70E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4188s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4185s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.86E+07 events/s

*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382706077425631E-002) differ by less than 4E-4 (9.988182347875352e-08)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382704338101225E-002) differ by less than 4E-4 (1.1850758729892164e-07)

*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
+ [NGOODHEL] ngoodhel/ncomb = 16/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09152 [9.1515591361999701E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.7196s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7149s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 90112 events => throughput is 1.90E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4980s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4951s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 90112 events => throughput is 3.03E+07 events/s

*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515592892887687E-002) differ by less than 4E-4 (9.973286385633884e-08)
+OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591361999701E-002) differ by less than 4E-4 (1.1646102771045719e-07)

*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.248963e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.742092e+07 ) sec^-1

*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.965001e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.171998e+07 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.028795e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.342442e+08 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.036734e+09 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.682729e+08 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.922721e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.347808e+08 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.234779e+09 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.842035e+08 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.375978e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.117791e+08 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.458252e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 9.921682e+07 ) sec^-1

TEST COMPLETED

diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
index b045ca6fab..66dad24eb4 100644
--- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum

CUDACPP_BUILDDIR='.'
-
make USEBUILDDIR=1 AVX=none
make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 AVX=avx2
+
make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'

OMP_NUM_THREADS=

-DATE: 2024-02-02_17:30:08
+DATE: 2024-02-08_19:36:07

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum

*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 3798 events (found 8192 events)
- [COUNTERS] PROGRAM TOTAL : 0.6792s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6708s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.72E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5099s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5040s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 4/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0
[UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1762s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.82E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1359s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1301s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 4/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0
[UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3933s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2984s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0949s for 90112 events => throughput is 9.49E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2793s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2170s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0623s for 90112 events => throughput is 1.45E+06 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 4/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
@@ -134,13 +134,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1
[UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1852s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1777s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1437s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1378s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.38E+06 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.7176438049659737e-10)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10)
*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 4/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1
[UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3742s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2963s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0779s for 90112 events => throughput is 1.16E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2896s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2240s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0656s for 90112 events => throughput is 1.37E+06 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.113110e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.412457e+06 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.110771e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.429501e+06 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 4/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
@@ -210,13 +210,13 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1
[UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1756s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1716s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.04E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1380s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1345s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.37E+06 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.7176438049659737e-10)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10)
*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 4/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1
[UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3281s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2840s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 90112 events => throughput is 2.04E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2586s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2205s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0381s for 90112 events => throughput is 2.36E+06 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.972024e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.500136e+06 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.077909e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.529098e+06 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 4/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
@@ -286,13 +286,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1
[UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1732s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1701s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1374s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1348s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.19E+06 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484913930753692e-10)
*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 4/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1
[UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3236s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2886s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0351s for 90112 events => throughput is 2.57E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2495s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2212s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0282s for 90112 events => throughput is 3.19E+06 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -332,166 +332,18 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.465781e+06 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.656537e+06 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1731s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1702s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3222s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2895s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 90112 events => throughput is 2.76E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.736897e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.322870e+06 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.910621e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.423505e+06 ) sec^-1
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.1732s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.3251s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2881s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0370s for 90112 events => throughput is 2.44E+06 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.380540e+06 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.468939e+06 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
--------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
+ [NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.09338 [9.3382715392009222E-002] fbridge_mode=1
[UNWEIGHT] Wrote 1591 events (found 1595 events)
- [COUNTERS] PROGRAM TOTAL : 0.5915s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5910s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4108s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.80E+07 events/s
*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715392009194E-002) differ by less than 2E-4 (1.3548906441229747e-10)
+OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715392009222E-002) differ by less than 2E-4 (1.3548862032308762e-10)
*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -538,18 +390,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
+ [NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1
[UNWEIGHT] Wrote 1782 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 0.7103s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7054s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4993s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4950s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 90112 events => throughput is 2.09E+07 events/s
*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -560,43 +412,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602021089631E-002
OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.965741e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.203414e+07 ) sec^-1
*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.950244e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.593725e+07 ) sec^-1
*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.732879e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.287558e+07 ) sec^-1
*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.458312e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.882542e+07 ) sec^-1
*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.736445e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.287734e+07 ) sec^-1
*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.983944e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.955308e+07 ) sec^-1
*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.735816e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.220279e+07 ) sec^-1
*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.141756e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.554716e+07 ) sec^-1
TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
index 0edfe47d2b..7d092ae287 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
CUDACPP_BUILDDIR='.'
-
-
- make USEBUILDDIR=1 AVX=none
+ make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+ make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
OMP_NUM_THREADS=
-DATE: 2024-02-02_17:30:25
+DATE: 2024-02-08_19:36:26
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
+ [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0
[UNWEIGHT] Wrote 2601 events (found 5405 events)
- [COUNTERS] PROGRAM TOTAL : 0.7950s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7522s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6734s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6448s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0287s for 8192 events => throughput is 2.86E+05 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
+ [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0
[UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3986s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3566s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0420s for 8192 events => throughput is 1.95E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3167s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2881s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.86E+05 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0
+ [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0
[UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7986s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3414s
- [COUNTERS] Fortran MEs ( 1 ) : 0.4572s for 90112 events => throughput is 1.97E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3427s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0308s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3120s for 90112 events => throughput is 2.89E+05 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1
[UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.4328s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3950s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0378s for 8192 events => throughput is 2.16E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3660s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3340s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16)
*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1
+ [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=1
[UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7474s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3356s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4117s for 90112 events => throughput is 2.19E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4147s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0627s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3520s for 90112 events => throughput is 2.56E+05 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989099) differ by less than 3E-14 (3.3306690738754696e-16)
*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.202621e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.597150e+05 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.194453e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.603334e+05 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094184803756619] fbridge_mode=1
[UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3966s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3751s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0216s for 8192 events => throughput is 3.80E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3271s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3094s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.64E+05 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756619) differ by less than 3E-14 (1.1102230246251565e-16)
*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1
+ [XSECTION] Cross section = 47.11 [47.105695279989085] fbridge_mode=1
[UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.5574s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3203s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2371s for 90112 events => throughput is 3.80E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.2488s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0540s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1947s for 90112 events => throughput is 4.63E+05 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989106) differ by less than 3E-14 (2.220446049250313e-16)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989085) differ by less than 3E-14 (5.551115123125783e-16)
*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.802932e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.756491e+05 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.752190e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.783795e+05 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
@@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_
[XSECTION] ChannelId = 1
[XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1
[UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3848s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3713s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0135s for 8192 events => throughput is 6.08E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3153s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3050s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0103s for 8192 events => throughput is 7.99E+05 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16)
*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
[NGOODHEL] ngoodhel/ncomb = 16/16
[XSECTION] VECSIZE_USED = 8192
[XSECTION] MultiChannel = TRUE
[XSECTION] Configuration = 1
[XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1
+ [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=1
[UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.4675s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3173s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1503s for 90112 events => throughput is 6.00E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.1526s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0398s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1128s for 90112 events => throughput is 7.99E+05 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989114) differ by less than 3E-14 (0.0)
*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.931066e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.276102e+05 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.034242e+05 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1
- [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3813s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3697s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.05E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1
- [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.4435s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3157s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1278s for 90112 events => throughput is 7.05E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.336215e+05 ) sec^-1
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.913129e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.976134e+05 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1
- [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3946s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3768s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.59E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1
- [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.5213s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3206s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2007s for 90112 events => throughput is 4.49E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.486588e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.259643e+05 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
--------------------
@@ -505,8 +357,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -514,13 +366,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.7886s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7880s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.44E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5915s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5908s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.09E+07 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16)

 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -538,8 +390,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -547,56 +399,56 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7692s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7627s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 90112 events => throughput is 1.37E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3353s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3276s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16)

 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.023940e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.337786e+06 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.691972e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.034768e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.997330e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.781902e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.067310e+08 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.755662e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.002923e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.780759e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.151958e+08 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.956840e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.012784e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.756228e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.073379e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.141575e+07 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
index 4666126254..f04e38a4a5 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 CUDACPP_BUILDDIR='.'
- make USEBUILDDIR=1 AVX=none
- make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+ make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

 OMP_NUM_THREADS=

-DATE: 2024-02-02_17:30:53
+DATE: 2024-02-08_19:36:52

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
+ [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0
  [UNWEIGHT] Wrote 2601 events (found 5405 events)
- [COUNTERS] PROGRAM TOTAL : 0.7788s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7367s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0420s for 8192 events => throughput is 1.95E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5890s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5604s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
+ [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.4031s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3605s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3196s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2910s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0
+ [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7981s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3372s
- [COUNTERS] Fortran MEs ( 1 ) : 0.4610s for 90112 events => throughput is 1.95E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3449s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0324s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3125s for 90112 events => throughput is 2.88E+05 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094177233089695] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094178241446492] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.4279s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3927s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0352s for 8192 events => throughput is 2.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3448s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3173s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.094184803756640) and cpp (47.094177233089695) differ by less than 4E-4 (1.6075587627728538e-07)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094178241446492) differ by less than 4E-4 (1.3934438314322506e-07)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105686104543288] fbridge_mode=1
+ [XSECTION] Cross section = 47.11 [47.105686930681671] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7255s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3394s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3860s for 90112 events => throughput is 2.33E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3550s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0532s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3018s for 90112 events => throughput is 2.99E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.105695279989099) and cpp (47.105686104543288) differ by less than 4E-4 (1.9478421364738097e-07)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105686930681671) differ by less than 4E-4 (1.7724624157278157e-07)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.371890e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.108953e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.337034e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.106693e+05 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094173275857273] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094176373190514] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3882s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3741s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.81E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3155s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3027s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.39E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.094184803756640) and cpp (47.094173275857273) differ by less than 4E-4 (2.447839242414318e-07)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094176373190514) differ by less than 4E-4 (1.7901501314643298e-07)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105682058834830] fbridge_mode=1
+ [XSECTION] Cross section = 47.11 [47.105685173093654] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.4967s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3339s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1629s for 90112 events => throughput is 5.53E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.1800s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0392s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1408s for 90112 events => throughput is 6.40E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.105695279989099) and cpp (47.105682058834830) differ by less than 4E-4 (2.8066997403985994e-07)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105685173093654) differ by less than 4E-4 (2.1455782361901043e-07)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.542889e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.477802e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.561027e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.774895e+05 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094171343713690] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094174474272364] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3707s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3629s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3055s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2992s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.094184803756640) and cpp (47.094171343713690) differ by less than 4E-4 (2.8581114641657024e-07)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094174474272364) differ by less than 4E-4 (2.1933672500473733e-07)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105681519092386] fbridge_mode=1
+ [XSECTION] Cross section = 47.11 [47.105684585116684] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.3902s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3040s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0862s for 90112 events => throughput is 1.05E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.1036s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0352s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0685s for 90112 events => throughput is 1.32E+06 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.105695279989099) and cpp (47.105681519092386) differ by less than 4E-4 (2.9212808838607884e-07)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105684585116684) differ by less than 4E-4 (2.2703990176786704e-07)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.020810e+06 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.022011e+06 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094171343713690] fbridge_mode=1
- [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3726s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3652s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 8192 events => throughput is 1.11E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.094184803756640) and cpp (47.094171343713690) differ by less than 4E-4 (2.8581114641657024e-07)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105681519092386] fbridge_mode=1
- [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.3847s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3058s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0789s for 90112 events => throughput is 1.14E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.105695279989099) and cpp (47.105681519092386) differ by less than 4E-4 (2.9212808838607884e-07)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.130747e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.372076e+06 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.140611e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.377066e+06 ) sec^-1

-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094178385820562] fbridge_mode=1
- [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3793s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3687s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0106s for 8192 events => throughput is 7.75E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.094184803756640) and cpp (47.094178385820562) differ by less than 4E-4 (1.3627873807209312e-07)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***

-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105688391077187] fbridge_mode=1
- [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.4295s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3171s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1125s for 90112 events => throughput is 8.01E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.105695279989099) and cpp (47.105688391077187) differ by less than 4E-4 (1.46243715803962e-07)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.801539e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.630491e+05 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094176770070867] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.7836s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7831s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.52E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5789s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5785s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.34E+07 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.094184803756640) and cpp (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094176770070867) differ by less than 4E-4 (1.705876382374072e-07)

 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1
+ [XSECTION] Cross section = 47.11 [47.105687115703695] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7290s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7235s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.65E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3547s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3510s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 90112 events => throughput is 2.42E+07 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.105695279989099) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722470687011935e-08)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105687115703695) differ by less than 4E-4 (1.733184357144424e-07)

 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.208020e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.899433e+07 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.852460e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.182152e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.818533e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.088571e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.761615e+08 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.029875e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.755961e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.086952e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.851789e+08 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.117274e+08 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.356500e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 9.307240e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.372719e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.512417e+07 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index db0e6484e4..69de55f839 100644
--- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
 CUDACPP_BUILDDIR='.'
- make USEBUILDDIR=1 AVX=none
-make USEBUILDDIR=1 AVX=sse4
+make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'

 OMP_NUM_THREADS=

-DATE: 2024-02-02_17:31:20
+DATE: 2024-02-08_19:37:18

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
+ [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0
  [UNWEIGHT] Wrote 2601 events (found 5405 events)
- [COUNTERS] PROGRAM TOTAL : 0.7877s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7462s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0415s for 8192 events => throughput is 1.98E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6035s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5750s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0
+ [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.4007s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3584s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3193s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2908s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0
+ [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7971s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3381s
- [COUNTERS] Fortran MEs ( 1 ) : 0.4590s for 90112 events => throughput is 1.96E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3507s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0370s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.3137s for 90112 events => throughput is 2.87E+05 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094186141863887] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.4305s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3926s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0378s for 8192 events => throughput is 2.16E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3553s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3223s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863887) differ by less than 2E-4 (2.841342827686333e-08)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428942997143e-08)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7590s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3410s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4180s for 90112 events => throughput is 2.16E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4240s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0604s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3636s for 90112 events => throughput is 2.48E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.105695279989099) and cpp (47.105696630006634) differ by less than 2E-4 (2.8659327133695456e-08)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ by less than 2E-4 (2.865932691165085e-08)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.107850e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.553748e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.119243e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.547176e+05 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094186141863887] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094186141863908] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.4038s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3829s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0209s for 8192 events => throughput is 3.92E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3266s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3094s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.77E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863887) differ by less than 2E-4 (2.841342827686333e-08)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863908) differ by less than 2E-4 (2.8413429165041748e-08)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.5521s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3202s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2319s for 90112 events => throughput is 3.89E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.2350s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0459s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1891s for 90112 events => throughput is 4.77E+05 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (47.105695279989099) and cpp (47.105696630006626) differ by less than 2E-4 (2.8659327133695456e-08)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ by less than 2E-4 (2.8659326689606246e-08)
 
 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.729825e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.825990e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.890474e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.826297e+05 ) sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094186193208834] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3829s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3698s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.24E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3102s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3002s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.18E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094186193208834) differ by less than 2E-4 (2.9503689491505725e-08)
 
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1
+ [XSECTION] Cross section = 47.11 [47.105696667630852] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.4649s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3167s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1482s for 90112 events => throughput is 6.08E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.1577s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0471s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1106s for 90112 events => throughput is 8.15E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105696667630852) differ by less than 2E-4 (2.9458046002517335e-08)
 
 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.981718e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.042172e+05 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1
- [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3832s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.77E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1
- [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.4364s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3095s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1269s for 90112 events => throughput is 7.10E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.023502e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.430058e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.000168e+05 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1
- [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.3908s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3733s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0175s for 8192 events => throughput is 4.69E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.499253e+05 ) sec^-1
 
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
 
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1
- [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.5047s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3145s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1901s for 90112 events => throughput is 4.74E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.481828e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.450695e+05 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
 
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1
+ [XSECTION] Cross section = 47.09 [47.094184798437837] fbridge_mode=1
  [UNWEIGHT] Wrote 1603 events (found 1608 events)
- [COUNTERS] PROGRAM TOTAL : 0.7843s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7837s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5762s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5755s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.12E+07 events/s
 
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (47.094184803756640) and cpp (47.094184798437830) differ by less than 2E-4 (1.1293987967064822e-10)
+OK! xsec from fortran (47.094184803756626) and cpp (47.094184798437837) differ by less than 2E-4 (1.1293943558143837e-10)
 
 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -538,8 +390,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 16/16
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -547,56 +399,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1
  [UNWEIGHT] Wrote 1744 events (found 1749 events)
- [COUNTERS] PROGRAM TOTAL : 1.7271s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7206s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.40E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3190s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3117s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 90112 events => throughput is 1.22E+07 events/s
 
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (47.105695279989099) and cpp (47.105695279068492) differ by less than 2E-4 (1.9543477947081556e-11)
+OK! xsec from fortran (47.105695279989114) and cpp (47.105695279068492) differ by less than 2E-4 (1.954369999168648e-11)
 
 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.038186e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.387307e+06 ) sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.737217e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.036460e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.003045e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.800149e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.059626e+08 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.799801e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.009453e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.805211e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.141818e+08 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.006027e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.992170e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.776932e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.011561e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.160521e+07 ) sec^-1
 
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index d7bf492fa9..31848fff27 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 
 CUDACPP_BUILDDIR='.'
-
-
  make USEBUILDDIR=1 AVX=none
  make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+ make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
-make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
-make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 OMP_NUM_THREADS=
 
-DATE: 2024-02-02_17:31:48
+DATE: 2024-02-08_19:37:44
 
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0
+ [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0
  [UNWEIGHT] Wrote 365 events (found 1496 events)
- [COUNTERS] PROGRAM TOTAL : 0.6900s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3570s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3330s for 8192 events => throughput is 2.46E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5805s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3781s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.2024s for 8192 events => throughput is 4.05E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0
+ [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL : 0.6528s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3200s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3328s for 8192 events => throughput is 2.46E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4585s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2560s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.2025s for 8192 events => throughput is 4.04E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0
- [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL : 5.2594s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5679s
- [COUNTERS] Fortran MEs ( 1 ) : 3.6915s for 90112 events => throughput is 2.44E+04 events/s
+ [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0
+ [UNWEIGHT] Wrote 1899 events (found 1904 events)
+ [COUNTERS] PROGRAM TOTAL : 3.4300s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1993s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.2307s for 90112 events => throughput is 4.04E+04 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1
+ [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL : 0.9620s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6342s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3278s for 8192 events => throughput is 2.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8274s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5459s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2815s for 8192 events => throughput is 2.91E+04 events/s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16)
 
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.07924 [7.9238481932717694E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL : 5.4901s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8765s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.6136s for 90112 events => throughput is 2.49E+04 events/s
+ [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1
+ [UNWEIGHT] Wrote 1899 events (found 1904 events)
+ [COUNTERS] PROGRAM TOTAL : 4.5819s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4840s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.0979s for 90112 events => throughput is 2.91E+04 events/s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717694E-002) differ by less than 3E-14 (3.3306690738754696e-16)
+OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16)
 
 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.575356e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.996227e+04 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.592675e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.988570e+04 ) sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1
+ [XSECTION] Cross section = 0.1011 [0.10112317668354515] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL : 0.6477s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4806s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1671s for 8192 events => throughput is 4.90E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5676s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4325s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1351s for 8192 events => throughput is 6.06E+04 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607748863) differ by less than 3E-14 (2.453592884421596e-14)
+OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354515) differ by less than 3E-14 (2.475797344914099e-14)
 
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.07924 [7.9238481932717680E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL : 3.5650s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7077s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8573s for 90112 events => throughput is 4.85E+04 events/s
+ [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1
+ [UNWEIGHT] Wrote 1899 events (found 1904 events)
+ [COUNTERS] PROGRAM TOTAL : 2.8126s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3366s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.4760s for 90112 events => throughput is 6.11E+04 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717680E-002) differ by less than 3E-14 (5.551115123125783e-16)
+OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16)
 
 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.776387e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.158217e+04 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.984656e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.162344e+04 ) sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1
+ [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL : 0.4871s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4012s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0859s for 8192 events => throughput is 9.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3964s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3297s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0667s for 8192 events => throughput is 1.23E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16)
+OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16)
 
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL : 2.5694s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6233s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9461s for 90112 events => throughput is 9.52E+04 events/s
+ [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1
+ [UNWEIGHT] Wrote 1899 events (found 1904 events)
+ [COUNTERS] PROGRAM TOTAL : 2.0050s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2715s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7335s for 90112 events => throughput is 1.23E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0)
 
 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.714492e+04 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.747130e+04 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1
- [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL : 0.4628s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3883s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0745s for 8192 events => throughput is 1.10E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL : 2.4465s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6165s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8300s for 90112 events => throughput is 1.09E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.107337e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.271134e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.124769e+05 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1
- [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL : 0.5193s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4167s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1026s for 8192 events => throughput is 7.98E+04 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.276682e+05 ) sec^-1
 
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
 
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL : 2.8468s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6753s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1715s for 90112 events => throughput is 7.69E+04 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.059547e+04 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.140399e+04 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
 
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1
+ [XSECTION] Cross section = 0.1011 [0.10112317668354760] fbridge_mode=1
  [UNWEIGHT] Wrote 386 events (found 1179 events)
- [COUNTERS] PROGRAM TOTAL : 0.7571s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7517s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5698s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5623s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s
 
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16)
+OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354760) differ by less than 3E-14 (4.440892098500626e-16)
 
 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1898 events (found 1903 events)
- [COUNTERS] PROGRAM TOTAL : 2.0046s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9818s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s
+ [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1
+ [UNWEIGHT] Wrote 1899 events (found 1904 events)
+ [COUNTERS] PROGRAM TOTAL : 1.5839s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5014s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0825s for 90112 events => throughput is 1.09E+06 events/s
 
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0)
+OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0)
 
 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.614946e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.136030e+06 ) sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.052412e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.166900e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.667267e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.675509e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.245582e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.305503e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.700625e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.674372e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.256386e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.839205e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.675204e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.662393e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.765286e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.405471e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 850026c210..60199f2caa 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-02-02_17:32:32 +DATE: 2024-02-08_19:38:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6835s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3525s - [COUNTERS] Fortran MEs ( 1 ) : 0.3310s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4836s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2813s + [COUNTERS] Fortran MEs ( 1 ) : 0.2023s for 8192 events => throughput is 4.05E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6461s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3151s - [COUNTERS] Fortran MEs ( 1 ) : 0.3311s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4597s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2574s + [COUNTERS] Fortran MEs ( 1 ) : 0.2023s for 8192 events => throughput is 4.05E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.1895s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5476s - [COUNTERS] Fortran MEs ( 1 ) : 3.6419s for 90112 events => throughput is 2.47E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 3.4250s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1992s + [COUNTERS] Fortran MEs ( 1 ) : 2.2257s for 90112 events => throughput is 4.05E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112722327776243] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112291597608296] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9189s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6145s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3044s for 8192 events => throughput is 2.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7637s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2556s for 8192 events => throughput is 3.21E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112722327776243) differ by less than 4E-4 (2.5986973362090993e-06) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291597608296) differ by less than 4E-4 (2.5781178285555484e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238466406484034E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.2049s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8432s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.3616s for 90112 events => throughput is 2.68E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239221732791437E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 4.2737s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4606s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.8131s for 90112 events => throughput is 3.20E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238466406484034E-002) differ by less than 4E-4 (1.9594309874637617e-07) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239221732791437E-002) differ by less than 4E-4 (1.8599953477416165e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.717227e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.314907e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.771768e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.317579e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112720218188545] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112290421591680] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4063s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0926s for 8192 events => throughput is 8.84E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4097s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3346s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112720218188545) differ by less than 4E-4 (2.8073040938547678e-06) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112290421591680) differ by less than 4E-4 (2.6944132867079418e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238450523404405E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.6549s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6338s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0211s for 90112 events => throughput is 8.82E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239212368085274E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 2.1080s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2816s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8264s for 90112 events => throughput is 1.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238450523404405E-002) differ by less than 4E-4 (3.9638963988952725e-07) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239212368085274E-002) differ by less than 4E-4 (3.0418222529693395e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.919079e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.104565e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.882000e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.091624e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112721286411488] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112291415112837] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3588s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3273s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2932s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 8192 events => throughput is 2.41E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721286411488) differ by less than 4E-4 (2.701672777827291e-06) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291415112837) differ by less than 4E-4 (2.5961646764605106e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238449434208005E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0893s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5973s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4920s for 90112 events => throughput is 1.83E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9239211617250407E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.6161s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2416s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3745s for 90112 events => throughput is 2.41E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238449434208005E-002) differ by less than 4E-4 (4.101354408314606e-07) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239211617250407E-002) differ by less than 4E-4 (3.136577692020026e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.828999e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.792882e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112721286411488] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4095s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3702s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0393s for 8192 events => throughput is 2.08E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721286411488) differ by less than 4E-4 (2.701672777827291e-06) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238449434208005E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0849s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6397s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4452s for 90112 events => throughput is 2.02E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238449434208005E-002) differ by less than 4E-4 (4.101354408314606e-07) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.132865e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.461538e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.123758e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.467933e+05 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112723411062496] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4209s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3695s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0514s for 8192 events => throughput is 1.59E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112723411062496) differ by less than 4E-4 (2.491576483576452e-06) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238464401552092E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.1490s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5598s for 90112 events => throughput is 1.61E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464401552092E-002) differ by less than 4E-4 (2.2124560195013743e-07) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.599197e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.590579e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112726034625695] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112292787307366] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7479s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7470s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5719s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5699s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.03E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112726034625695) differ by less than 4E-4 (2.2321452151086163e-06) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112292787307366) differ by less than 4E-4 (2.4604693221741414e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0049s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9947s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 90112 events => throughput is 8.90E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9239222545537072E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.5254s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5013s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0241s for 90112 events => throughput is 3.74E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239222545537072E-002) differ by less than 4E-4 (1.7574267630049434e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.293156e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.797913e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.820202e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.631443e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.658079e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.467315e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.423098e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.086873e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.660659e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.473805e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.542756e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.637719e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.518430e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.424130e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.626408e+07 ) sec^-1 
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.261612e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 71fcdf8259..d544006f8b 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-02-02_17:33:11 +DATE: 2024-02-08_19:38:58 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6918s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3604s - [COUNTERS] Fortran MEs ( 1 ) : 0.3314s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4837s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2813s + [COUNTERS] Fortran MEs ( 1 ) : 0.2024s for 8192 events => throughput is 4.05E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6484s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3162s - [COUNTERS] Fortran MEs ( 1 ) : 0.3322s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4602s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2576s + [COUNTERS] Fortran MEs ( 1 ) : 0.2026s for 8192 events => throughput is 4.04E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.2111s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5576s - [COUNTERS] Fortran MEs ( 1 ) : 3.6535s for 90112 events => throughput is 2.47E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 3.4295s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2025s + [COUNTERS] Fortran MEs ( 1 ) : 2.2270s for 90112 events => throughput is 4.05E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317761225882] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9785s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6448s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3337s for 8192 events => throughput is 2.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8252s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5389s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2863s for 8192 events => throughput is 2.86E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748700702684) differ by less than 2E-4 (9.191721828116783e-09) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317761225882) differ by less than 2E-4 (9.183959592817814e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.5580s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8749s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6831s for 90112 events => throughput is 2.45E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239237217958461E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 4.6463s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4911s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.1552s for 90112 events => throughput is 2.86E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002) differ by less than 2E-4 (9.423232416594374e-09) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237217958461E-002) differ by less than 2E-4 (9.4234364755863e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523895e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.920474e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.507399e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.924422e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748702805031] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317763556192] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6835s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5055s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1781s for 8192 events => throughput is 4.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5249s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3897s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1352s for 8192 events => throughput is 6.06E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748702805031) differ by less than 2E-4 (9.399612643790078e-09) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317763556192) differ by less than 2E-4 (9.41440236879032e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482683055653E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.5651s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7062s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8589s for 90112 events => throughput is 4.85E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239237221421968E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 2.8268s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3392s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.4876s for 90112 events => throughput is 6.06E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055653E-002) differ by less than 2E-4 (9.469362849401364e-09) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237221421968E-002) differ by less than 2E-4 (9.467145956065792e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.929939e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.219081e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.994581e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.226616e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317741957558] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4852s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3991s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0861s for 8192 events => throughput is 9.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3914s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3258s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0656s for 8192 events => throughput is 1.25E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317741957558) differ by less than 2E-4 (7.278528668663853e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5931s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6378s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9552s for 90112 events => throughput is 9.43E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239237072275287E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.9923s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2703s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7219s for 90112 events => throughput is 1.25E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237072275287E-002) differ by less than 2E-4 (7.584913142011374e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.734725e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.745363e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3918s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0742s for 8192 events => throughput is 1.10E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.4328s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6147s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8181s for 90112 events => throughput is 1.10E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.135224e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.280720e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.126026e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5246s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4211s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1035s for 8192 events => throughput is 7.91E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.274185e+05 ) sec^-1 -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.8202s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6565s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1637s for 90112 events => throughput is 7.74E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.784318e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.783835e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317662375726] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7584s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7529s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5766s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748601943165) differ by less than 2E-4 (5.74121417074025e-10) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317662375726) differ by less than 2E-4 (5.9126292750733e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9920s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9690s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.93E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9239236476482192E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.5790s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4964s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0826s for 90112 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481937154381E-002) differ by less than 2E-4 (5.5991211667105745e-11) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236476482192E-002) differ by less than 2E-4 (6.599809587726213e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.619289e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.139394e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.193860e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.153627e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.607364e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.672492e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.233488e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.306451e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.623886e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.681234e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244403e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.840455e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.628865e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.665696e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718930e+06 ) sec^-1 
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.397776e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 6a4dc45af4..bc6d179012 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-02_17:33:56 +DATE: 2024-02-08_19:39:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.6196s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3556s - [COUNTERS] Fortran MEs ( 1 ) : 4.2640s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.8828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3925s + [COUNTERS] Fortran MEs ( 1 ) : 2.4903s for 8192 events => throughput is 3.29E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.6525s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3443s - [COUNTERS] Fortran MEs ( 1 ) : 4.3081s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2733s + [COUNTERS] Fortran MEs ( 1 ) : 2.5077s for 8192 events => throughput is 3.27E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 48.6942s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0853s - [COUNTERS] Fortran MEs ( 1 ) : 46.6089s for 90112 events => throughput is 1.93E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 28.9805s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5848s + [COUNTERS] Fortran MEs ( 1 ) : 27.3957s for 90112 events => throughput is 3.29E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 9.2433s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7573s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4860s for 8192 events => throughput is 1.83E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.7782s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0129s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7654s for 8192 events => throughput is 2.18E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 55.5057s - [COUNTERS] Fortran Overhead ( 0 ) : 6.4484s - [COUNTERS] CudaCpp MEs ( 2 ) : 49.0573s for 90112 events => throughput is 1.84E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 46.7387s + [COUNTERS] Fortran Overhead ( 0 ) : 5.2958s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.4429s for 90112 events => throughput is 2.17E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451701E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.873700e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.239534e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.885335e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.231355e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.8340s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5435s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2906s for 8192 events => throughput is 3.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5669s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9185s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6484s for 8192 events => throughput is 4.97E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 29.4013s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3570s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.0443s for 90112 events => throughput is 3.60E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 21.3966s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2630s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.1336s for 90112 events => throughput is 4.97E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451704E-004) differ by less than 3E-14 (0.0) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.765368e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.089371e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.779969e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.071916e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143122579728E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.3128s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3150s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9977s for 8192 events => throughput is 8.21E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6587s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9637s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6951s for 8192 events => throughput is 1.18E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579728E-004) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 14.1468s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0979s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.0490s for 90112 events => throughput is 8.16E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 9.8396s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2455s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.5942s for 90112 events => throughput is 1.19E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.449952e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.461993e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.0739s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1991s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8747s for 8192 events => throughput is 9.37E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 12.6064s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9572s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6491s for 90112 events => throughput is 9.34E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.073449e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.228414e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.069597e+03 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7659s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5584s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2075s for 8192 events => throughput is 6.78E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.227429e+04 ) sec^-1 -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 15.3694s - [COUNTERS] Fortran Overhead ( 0 ) : 3.3113s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.0581s for 90112 events => throughput is 7.47E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.595421e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.619668e+03 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143122579723E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8881s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8549s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 8192 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9692s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8564s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1127s for 8192 events => throughput is 7.27E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579723E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.9701s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3629s for 90112 events => throughput is 2.48E+05 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914653E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 3.3032s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0640s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2391s for 90112 events => throughput is 7.27E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914653E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.288599e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.344224e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.503409e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.505970e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106655e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.244620e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.164804e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.041653e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.118423e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.247410e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.155529e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.233850e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.101198e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.239643e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.429655e+05 
) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.391120e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 0ba4f800e0..7b61eaf73e 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-02_17:38:17 +DATE: 2024-02-08_19:43:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.6400s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3441s - [COUNTERS] Fortran MEs ( 1 ) : 4.2960s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2754s + [COUNTERS] Fortran MEs ( 1 ) : 2.4993s for 8192 events => throughput is 3.28E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.5770s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3397s - [COUNTERS] Fortran MEs ( 1 ) : 4.2373s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7635s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2744s + [COUNTERS] Fortran MEs ( 1 ) : 2.4891s for 8192 events => throughput is 3.29E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 48.8126s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0960s - [COUNTERS] Fortran MEs ( 1 ) : 46.7166s for 90112 events => throughput is 1.93E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 29.1495s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5827s + [COUNTERS] Fortran MEs ( 1 ) : 27.5667s for 90112 events => throughput is 3.27E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703728935895570E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704259755238570E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.4004s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3198s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.0806s for 8192 events => throughput is 2.01E+03 events/s + [COUNTERS] PROGRAM TOTAL : 6.9384s + [COUNTERS] Fortran Overhead ( 0 ) : 3.5876s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.3508s for 8192 events => throughput is 2.44E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703728935895570E-004) differ by less than 4E-4 (3.0081376303225937e-06) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704259755238570E-004) differ by less than 4E-4 (3.0134411834747965e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793486223749466E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 51.1502s - [COUNTERS] Fortran Overhead ( 0 ) : 6.0928s - [COUNTERS] CudaCpp MEs ( 2 ) : 45.0574s for 90112 events => throughput is 2.00E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793580182117605E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 41.7441s + [COUNTERS] Fortran Overhead ( 0 ) : 4.8878s + [COUNTERS] CudaCpp MEs ( 2 ) : 36.8562s for 90112 events => throughput is 2.44E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793486223749466E-004) differ by less than 4E-4 (3.0127256538392544e-06) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580182117605E-004) differ by less than 4E-4 (3.024668687290344e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.059226e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.501242e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.060225e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.505866e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703721162664038E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704254541054809E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.5946s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4558s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1387s for 8192 events => throughput is 7.19E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9393s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1027s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8366s for 8192 events => throughput is 9.79E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703721162664038E-004) differ by less than 4E-4 (2.8072976823168005e-06) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254541054809E-004) differ by less than 4E-4 (2.8787221757475834e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793482900053113E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 15.8295s - [COUNTERS] Fortran Overhead ( 0 ) : 3.2375s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.5920s for 90112 events => throughput is 7.16E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793578161882866E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 11.6401s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3907s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.2493s for 90112 events => throughput is 9.74E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482900053113E-004) differ by less than 4E-4 (2.8022777314173908e-06) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578161882866E-004) differ by less than 4E-4 (2.896753368286653e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.405003e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.007784e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.316214e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.002571e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703719746039955E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704254166302247E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.3349s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8347s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5002s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9898s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6382s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3516s for 8192 events => throughput is 2.33E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703719746039955E-004) differ by less than 4E-4 (2.7706958254380964e-06) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254166302247E-004) differ by less than 4E-4 (2.8690396836061893e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793482744283897E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 8.1272s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6060s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.5212s for 90112 events => throughput is 1.63E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793578009696313E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 5.8017s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9237s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8780s for 90112 events => throughput is 2.32E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482744283897E-004) differ by less than 4E-4 (2.7924148244817815e-06) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578009696313E-004) differ by less than 4E-4 (2.887117363403746e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.676115e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.670075e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703719746039955E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.2197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7780s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4417s for 8192 events => throughput is 1.85E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703719746039955E-004) differ by less than 4E-4 (2.7706958254380964e-06) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793482744283897E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 7.4850s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5617s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.9233s for 90112 events => throughput is 1.83E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482744283897E-004) differ by less than 4E-4 (2.7924148244817815e-06) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.912557e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.410213e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.902365e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.405195e+04 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703728656142196E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.4235s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8824s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5411s for 8192 events => throughput is 1.51E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! 
xsec from fortran (3.8703612510102345E-004) and cpp (3.8703728656142196E-004) differ by less than 4E-4 (3.0009095357552695e-06) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793486988396928E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 8.6665s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6487s - [COUNTERS] CudaCpp MEs ( 2 ) : 6.0178s for 90112 events => throughput is 1.50E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793486988396928E-004) differ by less than 4E-4 (3.0611411687697654e-06) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.514893e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.518276e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704261630635685E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8461s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8246s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0215s for 8192 events => throughput is 3.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7970s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7415s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0555s for 8192 events => throughput is 1.48E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.197566737389579e-06) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704261630635685E-004) differ by less than 4E-4 (3.0618958697381515e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.8329s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5976s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2353s for 90112 events => throughput is 3.83E+05 events/s + [XSECTION] Cross section = 0.0001579 [1.5793580869662166E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 2.5912s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9808s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6105s for 90112 events => throughput is 1.48E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.2090047175081793e-06) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580869662166E-004) differ by less than 4E-4 (3.0682019858119247e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.583913e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.480897e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.920219e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.983179e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.497718e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.715216e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.733443e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.319773e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.472268e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.709955e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.669692e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.074118e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.488266e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.705288e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.527343e+05 
) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.430844e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 13919dda4a..294c855cdc 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-02_17:41:41 +DATE: 2024-02-08_19:46:30 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.5810s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3447s - [COUNTERS] Fortran MEs ( 1 ) : 4.2362s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2791s + [COUNTERS] Fortran MEs ( 1 ) : 2.4919s for 8192 events => throughput is 3.29E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.6053s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3440s - [COUNTERS] Fortran MEs ( 1 ) : 4.2612s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7717s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2746s + [COUNTERS] Fortran MEs ( 1 ) : 2.4971s for 8192 events => throughput is 3.28E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 48.9066s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1071s - [COUNTERS] Fortran MEs ( 1 ) : 46.7995s for 90112 events => throughput is 1.93E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 29.0020s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5876s + [COUNTERS] Fortran MEs ( 1 ) : 27.4144s for 90112 events => throughput is 3.29E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612659176647E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143272044121E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 9.2947s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7623s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5324s for 8192 events => throughput is 1.81E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.8731s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0673s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8059s for 8192 events => throughput is 2.15E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612659176647E-004) differ by less than 2E-4 (3.851689633904698e-09) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143272044121E-004) differ by less than 2E-4 (3.861716058040088e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438704534937E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 56.6356s - [COUNTERS] Fortran Overhead ( 0 ) : 6.5504s - [COUNTERS] CudaCpp MEs ( 2 ) : 50.0852s for 90112 events => throughput is 1.80E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532474032691E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 47.1946s + [COUNTERS] Fortran Overhead ( 0 ) : 5.3591s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.8355s for 90112 events => throughput is 2.15E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438704534937E-004) differ by less than 2E-4 (3.930950898123342e-09) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532474032691E-004) differ by less than 2E-4 (3.933131154099101e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.861679e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.178564e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.855599e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.190429e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612692816692E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143304774347E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.8767s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5749s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3019s for 8192 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.4953s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8665s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6288s for 8192 events => throughput is 5.03E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612692816692E-004) differ by less than 2E-4 (4.720860369289426e-09) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143304774347E-004) differ by less than 2E-4 (4.707367828871156e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438707226032E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 29.5553s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4277s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.1276s for 90112 events => throughput is 3.59E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532476698221E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 21.1380s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2192s + [COUNTERS] CudaCpp MEs ( 2 ) : 17.9188s for 90112 events => throughput is 5.03E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438707226032E-004) differ by less than 2E-4 (4.101344153184527e-09) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532476698221E-004) differ by less than 2E-4 (4.101904815811963e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.686869e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.172254e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.692172e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.155646e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143287857844E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.3260s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9991s for 8192 events => throughput is 8.20E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9367s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6803s for 8192 events => throughput is 1.20E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143287857844E-004) differ by less than 2E-4 (4.2702956726259345e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 13.9915s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0877s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.9038s for 90112 events => throughput is 8.26E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532473043530E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 9.7082s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2389s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.4693s for 90112 events => throughput is 1.21E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532473043530E-004) differ by less than 2E-4 (3.870500364655527e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.461480e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.508596e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.0588s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1912s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8676s for 8192 events => throughput is 9.44E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 1361 events (found 1881 events)
- [COUNTERS] PROGRAM TOTAL : 12.6716s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.9800s
- [COUNTERS] CudaCpp MEs ( 2 ) : 9.6916s for 90112 events => throughput is 9.30E+03 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.752793e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.222921e+04 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.685901e+03 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
--------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 121 events (found 923 events)
- [COUNTERS] PROGRAM TOTAL : 2.5632s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4492s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1140s for 8192 events => throughput is 7.35E+03 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.236253e+04 ) sec^-1
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
--------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
--------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 1361 events (found 1881 events)
- [COUNTERS] PROGRAM TOTAL : 15.3981s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.2041s
- [COUNTERS] CudaCpp MEs ( 2 ) : 12.1939s for 90112 events => throughput is 7.39E+03 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09)
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.508701e+03 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.501942e+03 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
--------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 64/64
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000387 [3.8704143124638075E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 121 events (found 923 events)
- [COUNTERS] PROGRAM TOTAL : 0.8869s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8539s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8791s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7656s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1135s for 8192 events => throughput is 7.22E+04 events/s

*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.4279691852343603e-11)
+OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143124638075E-004) differ by less than 2E-4 (5.318190332559425e-11)

*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 64/64
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0001579 [1.5793438642387715E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 1361 events (found 1881 events)
- [COUNTERS] PROGRAM TOTAL : 2.9728s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.6099s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3629s for 90112 events => throughput is 2.48E+05 events/s
+ [XSECTION] Cross section = 0.0001579 [1.5793532411887058E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 1358 events (found 1880 events)
+ [COUNTERS] PROGRAM TOTAL : 3.3145s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.0695s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.2450s for 90112 events => throughput is 7.24E+04 events/s

*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642387715E-004) differ by less than 2E-4 (4.051647906067046e-12)
+OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411887058E-004) differ by less than 2E-4 (1.7474910407599964e-12)

*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.285287e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.287101e+04 ) sec^-1

*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.523099e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.492219e+04 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.114422e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.245674e+05 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.150176e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.022906e+04 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.108617e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.246083e+05 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.164425e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.231666e+05 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.105220e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.243352e+05 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.429812e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.379631e+04 ) sec^-1

TEST COMPLETED

diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
index 0d455d9e11..885118dbe4 100644
--- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg

CUDACPP_BUILDDIR='.'
-
- make USEBUILDDIR=1 AVX=none
+ make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'

OMP_NUM_THREADS=

-DATE: 2024-02-02_17:47:36
+DATE: 2024-02-08_19:28:28

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg

*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
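A note on the six-line block that these runs pipe to madevent on stdin: the first field gives the number of events together with the max and min iteration counts, and the remaining lines steer the accuracy target, grid adjustment, single-diagram enhancement, helicity summation and the multi-channel number, exactly as the inline "!" comments say. A minimal Python sketch of how such a card could be written (the write_input_card helper and its defaults are hypothetical, for illustration only; the real cards are prepared by the tmad test scripts):

```python
# Hypothetical sketch: generate an input card like /tmp/<user>/input_ggttggg_x1_cudacpp.
# Field meanings are taken from the inline "!" comments in the logs above.
def write_input_card(path, nevents=8192, iterations=1, channel=1):
    lines = [
        f"{nevents} {iterations} {iterations} ! Number of events and max and min iterations",
        "0.000001 ! Accuracy (ignored because max iterations = min iterations)",
        "0 ! Grid Adjustment 0=none, 2=adjust",
        "1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)",
        "0 ! Helicity Sum/event 0=exact",
        f"{channel} ! Channel number (1-N) for single-diagram enhancement multi-channel",
    ]
    with open(path, "w") as f:
        f.write("\n".join(lines) + "\n")

write_input_card("/tmp/input_ggttggg_x1", nevents=8192)    # "x1" runs use 8192 events
write_input_card("/tmp/input_ggttggg_x10", nevents=81920)  # "x10" runs use ten times more
```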
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0
 [UNWEIGHT] Wrote 1 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 97.5776s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5080s
- [COUNTERS] Fortran MEs ( 1 ) : 97.0696s for 8192 events => throughput is 8.44E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 54.6661s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5009s
+ [COUNTERS] Fortran MEs ( 1 ) : 54.1652s for 8192 events => throughput is 1.51E+02 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0
 [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 97.4990s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5088s
- [COUNTERS] Fortran MEs ( 1 ) : 96.9903s for 8192 events => throughput is 8.45E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 54.6933s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4405s
+ [COUNTERS] Fortran MEs ( 1 ) : 54.2528s for 8192 events => throughput is 1.51E+02 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0
+ [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 1072.0234s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.4573s
- [COUNTERS] Fortran MEs ( 1 ) : 1067.5662s for 90112 events => throughput is 8.44E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 602.4668s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.1984s
+ [COUNTERS] Fortran MEs ( 1 ) : 599.2684s for 90112 events => throughput is 1.50E+02 events/s

*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
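The throughput quoted in each [COUNTERS] line is simply the event count divided by the time spent in the matrix-element part alone, with the Fortran overhead excluded: for example, 8192 events in 54.1652s of Fortran MEs gives 8192/54.1652 ≈ 1.51E+02 events/s in the x1 run above. A small illustrative recomputation of two of those figures (the real counters are implemented in the generated Fortran/C++ code, not in Python):

```python
# Recompute two [COUNTERS] throughput figures from the log above.
def throughput(nevents, me_seconds):
    return nevents / me_seconds

# Fortran MEs, x1 run: 8192 events in 54.1652s => ~1.51E+02 events/s.
assert abs(throughput(8192, 54.1652) - 1.51e2) / 1.51e2 < 1e-2
# Fortran MEs, x10 run: 90112 events in 599.2684s => ~1.50E+02 events/s.
assert abs(throughput(90112, 599.2684) - 1.50e2) / 1.50e2 < 1e-2
```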
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1
 [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 221.5798s
- [COUNTERS] Fortran Overhead ( 0 ) : 99.1680s
- [COUNTERS] CudaCpp MEs ( 2 ) : 122.4118s for 8192 events => throughput is 6.69E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 176.1915s
+ [COUNTERS] Fortran Overhead ( 0 ) : 80.7281s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 95.4634s for 8192 events => throughput is 8.58E+01 events/s

*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939193E-006) differ by less than 3E-14 (1.3322676295501878e-15)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15)

*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.332e-07 [2.3322783648085453E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 1418.4252s
- [COUNTERS] Fortran Overhead ( 0 ) : 102.1892s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1316.2360s for 90112 events => throughput is 6.85E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 1129.7650s
+ [COUNTERS] Fortran Overhead ( 0 ) : 82.4810s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1047.2841s for 90112 events => throughput is 8.60E+01 events/s

*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.5543122344752192e-15)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085453E-007) differ by less than 3E-14 (1.5543122344752192e-15)

*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.102951e+01 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.033504e+02 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.199223e+01 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.037398e+02 ) sec^-1

*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1
 [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 110.4001s
- [COUNTERS] Fortran Overhead ( 0 ) : 50.9575s
- [COUNTERS] CudaCpp MEs ( 2 ) : 59.4426s for 8192 events => throughput is 1.38E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 81.1983s
+ [COUNTERS] Fortran Overhead ( 0 ) : 36.6778s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 44.5205s for 8192 events => throughput is 1.84E+02 events/s

*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.5543122344752192e-15)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15)

*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.332e-07 [2.3322783648085448E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 717.4568s
- [COUNTERS] Fortran Overhead ( 0 ) : 55.5343s
- [COUNTERS] CudaCpp MEs ( 2 ) : 661.9225s for 90112 events => throughput is 1.36E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 529.9074s
+ [COUNTERS] Fortran Overhead ( 0 ) : 39.3238s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 490.5836s for 90112 events => throughput is 1.84E+02 events/s

*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.7763568394002505e-15)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085448E-007) differ by less than 3E-14 (1.3322676295501878e-15)

*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.625359e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.262675e+02 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.635897e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.273542e+02 ) sec^-1

*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1
 [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 50.8754s
- [COUNTERS] Fortran Overhead ( 0 ) : 23.7742s
- [COUNTERS] CudaCpp MEs ( 2 ) : 27.1012s for 8192 events => throughput is 3.02E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 35.1935s
+ [COUNTERS] Fortran Overhead ( 0 ) : 16.0517s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 19.1418s for 8192 events => throughput is 4.28E+02 events/s

*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15)

*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.332e-07 [2.3322783648085445E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 327.5508s
- [COUNTERS] Fortran Overhead ( 0 ) : 27.6050s
- [COUNTERS] CudaCpp MEs ( 2 ) : 299.9458s for 90112 events => throughput is 3.00E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 228.7491s
+ [COUNTERS] Fortran Overhead ( 0 ) : 18.6295s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 210.1196s for 90112 events => throughput is 4.29E+02 events/s

*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085445E-007) differ by less than 3E-14 (1.1102230246251565e-15)

*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.534698e+02 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.548065e+02 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
--------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1
- [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 45.2566s
- [COUNTERS] Fortran Overhead ( 0 ) : 20.7876s
- [COUNTERS] CudaCpp MEs ( 2 ) : 24.4690s for 8192 events => throughput is 3.35E+02 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
--------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
--------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1
- [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 293.8217s
- [COUNTERS] Fortran Overhead ( 0 ) : 24.8277s
- [COUNTERS] CudaCpp MEs ( 2 ) : 268.9940s for 90112 events => throughput is 3.35E+02 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.054926e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.274856e+02 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.031596e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.276171e+02 ) sec^-1
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
--------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1
- [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 46.7263s
- [COUNTERS] Fortran Overhead ( 0 ) : 23.0770s
- [COUNTERS] CudaCpp MEs ( 2 ) : 23.6493s for 8192 events => throughput is 3.46E+02 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
--------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
--------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1
- [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 288.7897s
- [COUNTERS] Fortran Overhead ( 0 ) : 27.1152s
- [COUNTERS] CudaCpp MEs ( 2 ) : 261.6745s for 90112 events => throughput is 3.44E+02 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.641032e+02 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.643252e+02 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
--------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1
 [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 4.2603s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.1742s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.0862s for 8192 events => throughput is 7.54E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 11.5004s
+ [COUNTERS] Fortran Overhead ( 0 ) : 7.6961s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.8043s for 8192 events => throughput is 2.15E+03 events/s

*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.5543122344752192e-15)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15)

*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.332e-07 [2.3322783648085437E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 19.0532s
- [COUNTERS] Fortran Overhead ( 0 ) : 7.1340s
- [COUNTERS] CudaCpp MEs ( 2 ) : 11.9192s for 90112 events => throughput is 7.56E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 51.8753s
+ [COUNTERS] Fortran Overhead ( 0 ) : 10.1190s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 41.7563s for 90112 events => throughput is 2.16E+03 events/s

*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.5543122344752192e-15)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085437E-007) differ by less than 3E-14 (8.881784197001252e-16)

*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.520580e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.175030e+03 ) sec^-1

*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.210159e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.229111e+03 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.224862e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.552785e+03 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 512 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.574822e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.462719e+03 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.231380e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.564096e+03 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.424808e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.521452e+03 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.241405e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.559456e+03 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.245841e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.123633e+03 ) sec^-1

TEST COMPLETED

diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
index 5c1f32d186..70f4cabe46 100644
--- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg

CUDACPP_BUILDDIR='.'
make USEBUILDDIR=1 AVX=none
-
make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
-make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'

OMP_NUM_THREADS=

-DATE: 2024-02-02_19:15:45
+DATE: 2024-02-08_20:23:49

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg

*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0
 [UNWEIGHT] Wrote 1 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 97.9258s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5031s
- [COUNTERS] Fortran MEs ( 1 ) : 97.4228s for 8192 events => throughput is 8.41E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 54.4328s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3678s
+ [COUNTERS] Fortran MEs ( 1 ) : 54.0651s for 8192 events => throughput is 1.52E+02 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0
 [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 97.4069s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5090s
- [COUNTERS] Fortran MEs ( 1 ) : 96.8979s for 8192 events => throughput is 8.45E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 54.4664s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4348s
+ [COUNTERS] Fortran MEs ( 1 ) : 54.0315s for 8192 events => throughput is 1.52E+02 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0
+ [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0
 [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 1073.4860s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.4705s
- [COUNTERS] Fortran MEs ( 1 ) : 1069.0155s for 90112 events => throughput is 8.43E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 598.2310s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.0769s
+ [COUNTERS] Fortran MEs ( 1 ) : 595.1541s for 90112 events => throughput is 1.51E+02 events/s

*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.241e-06 [1.2405719498009764E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.241e-06 [1.2405363572559468E-006] fbridge_mode=1
 [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 197.4229s
- [COUNTERS] Fortran Overhead ( 0 ) : 91.0913s
- [COUNTERS] CudaCpp MEs ( 2 ) : 106.3316s for 8192 events => throughput is 7.70E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 161.0181s
+ [COUNTERS] Fortran Overhead ( 0 ) : 73.7447s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 87.2734s for 8192 events => throughput is 9.39E+01 events/s

*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405719498009764E-006) differ by less than 4E-4 (0.00013981555433351112)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363572559468E-006) differ by less than 4E-4 (0.00013984863241267576)

*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.333e-07 [2.3326289850060011E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.333e-07 [2.3326080615569212E-007] fbridge_mode=1
 [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 1261.2250s
- [COUNTERS] Fortran Overhead ( 0 ) : 94.7599s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1166.4651s for 90112 events => throughput is 7.73E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 1039.3540s
+ [COUNTERS] Fortran Overhead ( 0 ) : 76.5015s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 962.8525s for 90112 events => throughput is 9.36E+01 events/s

*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326289850060011E-007) differ by less than 4E-4 (0.00014135250101854346)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326080615569212E-007) differ by less than 4E-4 (0.00014136252059526733)

*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.080943e+01 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.118262e+02 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.069024e+01 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.120899e+02 ) sec^-1

*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 128/128
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.241e-06 [1.2405716133562926E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.241e-06 [1.2405361288903015E-006] fbridge_mode=1
 [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 50.3586s
- [COUNTERS] Fortran Overhead ( 0 ) : 23.7547s
- [COUNTERS] CudaCpp MEs ( 2 ) : 26.6039s for 8192 events => throughput is 3.08E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 39.9885s
+ [COUNTERS] Fortran Overhead ( 0 ) : 18.0960s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 21.8925s for 8192 events => throughput is 3.74E+02 events/s

*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405716133562926E-006) differ by less than 4E-4 (0.0001395443151488429)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405361288903015E-006) differ by less than 4E-4 (0.0001396645204514435)

*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.333e-07 [2.3326283773234128E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.333e-07 [2.3326076878598447E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 320.2946s
- [COUNTERS] Fortran Overhead ( 0 ) : 27.7541s
- [COUNTERS] CudaCpp MEs ( 2 ) : 292.5406s for 90112 events => throughput is 3.08E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 253.5381s
+ [COUNTERS] Fortran Overhead ( 0 ) : 20.6367s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 232.9014s for 90112 events => throughput is 3.87E+02 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326283773234128E-007) differ by less than 4E-4 (0.00014109195015965525)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326076878598447E-007) differ by less than 4E-4 (0.00014120229226155523)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.529766e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.660807e+02 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.534003e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.664498e+02 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.241e-06 [1.2405715853898719E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.241e-06 [1.2405360895331841E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 25.7271s
- [COUNTERS] Fortran Overhead ( 0 ) : 12.1683s
- [COUNTERS] CudaCpp MEs ( 2 ) : 13.5588s for 8192 events => throughput is 6.04E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 17.8143s
+ [COUNTERS] Fortran Overhead ( 0 ) : 8.1811s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 9.6332s for 8192 events => throughput is 8.50E+02 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405715853898719E-006) differ by less than 4E-4 (0.00013952176883003098)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405360895331841E-006) differ by less than 4E-4 (0.00013963279012663143)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.333e-07 [2.3326275792962891E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.333e-07 [2.3326069099562333E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 165.2326s
- [COUNTERS] Fortran Overhead ( 0 ) : 16.1362s
- [COUNTERS] CudaCpp MEs ( 2 ) : 149.0964s for 90112 events => throughput is 6.04E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 115.7443s
+ [COUNTERS] Fortran Overhead ( 0 ) : 10.8101s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 104.9342s for 90112 events => throughput is 8.59E+02 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326275792962891E-007) differ by less than 4E-4 (0.00014074978690437057)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326069099562333E-007) differ by less than 4E-4 (0.00014086875419705436)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.019714e+02 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.049492e+02 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.241e-06 [1.2405715853898719E-006] fbridge_mode=1
- [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 22.8521s
- [COUNTERS] Fortran Overhead ( 0 ) : 10.7252s
- [COUNTERS] CudaCpp MEs ( 2 ) : 12.1268s for 8192 events => throughput is 6.76E+02 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405715853898719E-006) differ by less than 4E-4 (0.00013952176883003098)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.333e-07 [2.3326275792962891E-007] fbridge_mode=1
- [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 152.7981s
- [COUNTERS] Fortran Overhead ( 0 ) : 14.7784s
- [COUNTERS] CudaCpp MEs ( 2 ) : 138.0197s for 90112 events => throughput is 6.53E+02 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326275792962891E-007) differ by less than 4E-4 (0.00014074978690437057)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.650399e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.054826e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.713162e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.061844e+03 ) sec^-1

-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.241e-06 [1.2405719423038986E-006] fbridge_mode=1
- [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 24.7046s
- [COUNTERS] Fortran Overhead ( 0 ) : 12.1305s
- [COUNTERS] CudaCpp MEs ( 2 ) : 12.5741s for 8192 events => throughput is 6.52E+02 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405719423038986E-006) differ by less than 4E-4 (0.00013980951024539223)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***

-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.333e-07 [2.3326283662420285E-007] fbridge_mode=1
- [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 151.9709s
- [COUNTERS] Fortran Overhead ( 0 ) : 16.3494s
- [COUNTERS] CudaCpp MEs ( 2 ) : 135.6214s for 90112 events => throughput is 6.64E+02 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326283662420285E-007) differ by less than 4E-4 (0.00014108719888938914)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.299667e+02 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.263869e+02 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.241e-06 [1.2405722175509506E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.241e-06 [1.2405363557292459E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 2.5146s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.0229s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4917s for 8192 events => throughput is 1.67E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 6.0668s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.2571s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.8097s for 8192 events => throughput is 4.53E+03 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405722175509506E-006) differ by less than 4E-4 (0.00014003141235763295)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363557292459E-006) differ by less than 4E-4 (0.00013984740156258724)

 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.333e-07 [2.3326074784076956E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 11.3635s
- [COUNTERS] Fortran Overhead ( 0 ) : 5.9535s
- [COUNTERS] CudaCpp MEs ( 2 ) : 5.4100s for 90112 events => throughput is 1.67E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 26.6359s
+ [COUNTERS] Fortran Overhead ( 0 ) : 6.8241s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 19.8118s for 90112 events => throughput is 4.55E+03 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.00014165768834106807)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326074784076956E-007) differ by less than 4E-4 (0.00014111248645076735)

 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.635666e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.576856e+03 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.624580e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.546931e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.348580e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.397159e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 512 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.418312e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.533190e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.329606e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.399878e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.375742e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.130059e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.272719e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.400049e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.426495e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.094833e+03 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
index eecc6607f5..dd4d5d35ae 100644
--- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg

 CUDACPP_BUILDDIR='.'
-
-
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+ make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'

 OMP_NUM_THREADS=

-DATE: 2024-02-02_20:22:01
+DATE: 2024-02-08_21:07:57

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0
  [UNWEIGHT] Wrote 1 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 97.7739s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5058s
- [COUNTERS] Fortran MEs ( 1 ) : 97.2681s for 8192 events => throughput is 8.42E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 54.4597s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3673s
+ [COUNTERS] Fortran MEs ( 1 ) : 54.0925s for 8192 events => throughput is 1.51E+02 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0
+ [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 98.2385s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5079s
- [COUNTERS] Fortran MEs ( 1 ) : 97.7306s for 8192 events => throughput is 8.38E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 54.5397s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s
+ [COUNTERS] Fortran MEs ( 1 ) : 54.1387s for 8192 events => throughput is 1.51E+02 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0
+ [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 1072.2765s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.4768s
- [COUNTERS] Fortran MEs ( 1 ) : 1067.7997s for 90112 events => throughput is 8.44E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 598.2283s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.0901s
+ [COUNTERS] Fortran MEs ( 1 ) : 595.1382s for 90112 events => throughput is 1.51E+02 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985299359846E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.24e-06 [1.2403629013416990E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 213.7467s
- [COUNTERS] Fortran Overhead ( 0 ) : 99.0581s
- [COUNTERS] CudaCpp MEs ( 2 ) : 114.6886s for 8192 events => throughput is 7.14E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 174.1417s
+ [COUNTERS] Fortran Overhead ( 0 ) : 80.0402s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 94.1014s for 8192 events => throughput is 8.71E+01 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985299359846E-006) differ by less than 2E-4 (5.7578810608305275e-09)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629013416990E-006) differ by less than 2E-4 (5.7565425759520394e-09)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.332e-07 [2.3322783773791503E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 1362.5356s
- [COUNTERS] Fortran Overhead ( 0 ) : 103.3719s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1259.1637s for 90112 events => throughput is 7.16E+01 events/s
+ [COUNTERS] PROGRAM TOTAL : 1119.8209s
+ [COUNTERS] Fortran Overhead ( 0 ) : 82.7419s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1037.0790s for 90112 events => throughput is 8.69E+01 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993212353001E-007) differ by less than 2E-4 (5.389403812117166e-09)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783773791503E-007) differ by less than 2E-4 (5.389840573855054e-09)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.335207e+01 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.026356e+02 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.320228e+01 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.036053e+02 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985295828473E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.24e-06 [1.2403629009850969E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 112.0066s
- [COUNTERS] Fortran Overhead ( 0 ) : 51.7677s
- [COUNTERS] CudaCpp MEs ( 2 ) : 60.2389s for 8192 events => throughput is 1.36E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 78.5590s
+ [COUNTERS] Fortran Overhead ( 0 ) : 35.0840s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 43.4750s for 8192 events => throughput is 1.88E+02 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985295828473E-006) differ by less than 2E-4 (5.473184350179849e-09)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629009850969E-006) differ by less than 2E-4 (5.469044328521022e-09)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993222645648E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.332e-07 [2.3322783784120318E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 715.0255s
- [COUNTERS] Fortran Overhead ( 0 ) : 55.6788s
- [COUNTERS] CudaCpp MEs ( 2 ) : 659.3467s for 90112 events => throughput is 1.37E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 517.3622s
+ [COUNTERS] Fortran Overhead ( 0 ) : 37.8419s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 479.5204s for 90112 events => throughput is 1.88E+02 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222645648E-007) differ by less than 2E-4 (5.8307128014689624e-09)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783784120318E-007) differ by less than 2E-4 (5.832704319530535e-09)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.602492e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.373550e+02 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.597872e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.359491e+02 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.24e-06 [1.2403629007633195E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 48.5543s
- [COUNTERS] Fortran Overhead ( 0 ) : 22.3824s
- [COUNTERS] CudaCpp MEs ( 2 ) : 26.1720s for 8192 events => throughput is 3.13E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 33.9565s
+ [COUNTERS] Fortran Overhead ( 0 ) : 15.2150s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 18.7415s for 8192 events => throughput is 4.37E+02 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629007633195E-006) differ by less than 2E-4 (5.290244020628165e-09)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.332e-07 [2.3322783783946155E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 315.4178s
- [COUNTERS] Fortran Overhead ( 0 ) : 26.4124s
- [COUNTERS] CudaCpp MEs ( 2 ) : 289.0053s for 90112 events => throughput is 3.12E+02 events/s
+ [COUNTERS] PROGRAM TOTAL : 224.7320s
+ [COUNTERS] Fortran Overhead ( 0 ) : 17.7872s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 206.9448s for 90112 events => throughput is 4.35E+02 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783783946155E-007) differ by less than 2E-4 (5.825236737422301e-09)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.757760e+02 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.742666e+02 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1
- [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 43.1450s
- [COUNTERS] Fortran Overhead ( 0 ) : 19.8632s
- [COUNTERS] CudaCpp MEs ( 2 ) : 23.2817s for 8192 events => throughput is 3.52E+02 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1
- [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 281.3910s
- [COUNTERS] Fortran Overhead ( 0 ) : 23.7458s
- [COUNTERS] CudaCpp MEs ( 2 ) : 257.6451s for 90112 events => throughput is 3.50E+02 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.257947e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.612074e+02 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.224870e+02 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.589380e+02 ) sec^-1

-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1
- [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 45.7000s
- [COUNTERS] Fortran Overhead ( 0 ) : 22.4969s
- [COUNTERS] CudaCpp MEs ( 2 ) : 23.2032s for 8192 events => throughput is 3.53E+02 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***

-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 128/128
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1
- [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 283.4810s
- [COUNTERS] Fortran Overhead ( 0 ) : 26.1975s
- [COUNTERS] CudaCpp MEs ( 2 ) : 257.2834s for 90112 events => throughput is 3.50E+02 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.788017e+02 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.769468e+02 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1
+ [XSECTION] Cross section = 1.24e-06 [1.2403628931370709E-006] fbridge_mode=1
  [UNWEIGHT] Wrote 70 events (found 407 events)
- [COUNTERS] PROGRAM TOTAL : 3.5973s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.7362s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8611s for 8192 events => throughput is 9.51E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 12.2525s
+ [COUNTERS] Fortran Overhead ( 0 ) : 8.0735s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 4.1789s for 8192 events => throughput is 1.96E+03 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480693924894922e-10)
+OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628931370709E-006) differ by less than 2E-4 (8.581571009358413e-10)

 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 128/128
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 2.332e-07 [2.3322993078576736E-007] fbridge_mode=1
+ [XSECTION] Cross section = 2.332e-07 [2.3322783640044522E-007] fbridge_mode=1
  [UNWEIGHT] Wrote 303 events (found 1531 events)
- [COUNTERS] PROGRAM TOTAL : 16.1741s
- [COUNTERS] Fortran Overhead ( 0 ) : 6.6830s
- [COUNTERS] CudaCpp MEs ( 2 ) : 9.4910s for 90112 events => throughput is 9.49E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 56.4709s
+ [COUNTERS] Fortran Overhead ( 0 ) : 10.5806s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 45.8902s for 90112 events => throughput is 1.96E+03 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993078576736E-007) differ by less than 2E-4 (3.4640645907302314e-10)
+OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783640044522E-007) differ by less than 2E-4 (3.447657714872321e-10)

 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.460914e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.979939e+03 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.085934e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.998270e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.112990e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.330145e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 512 32 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.159841e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.387574e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.108106e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.328512e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.107813e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.245538e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.114186e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.317098e+03 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 ***
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.641595e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.085726e+03 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
index b178ee423e..1e31973081 100644
--- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu

 CUDACPP_BUILDDIR='.'
-
-
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+ make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-02-02_17:46:04 +DATE: 2024-02-08_19:50:18 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4637s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3912s - [COUNTERS] Fortran MEs ( 1 ) : 0.0726s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4530s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4049s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3923s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3204s - [COUNTERS] Fortran MEs ( 1 ) : 0.0719s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3084s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2603s + [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.3323s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5501s - [COUNTERS] Fortran MEs ( 1 ) : 0.7821s for 90112 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7392s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2138s + [COUNTERS] Fortran MEs ( 1 ) : 0.5253s for 90112 events => throughput is 1.72E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4815s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4033s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0782s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4186s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3493s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263335) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561287] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679754343820] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.5134s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6501s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8633s for 90112 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0585s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2942s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7643s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561287) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343820) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.048855e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.208650e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.034927e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.207106e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351262536] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226551166122] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4072s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3668s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0404s for 8192 events => throughput is 2.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3310s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 8192 events => throughput is 2.47E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351262536) differ by less than 3E-14 (2.930988785010413e-14) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166122) differ by less than 3E-14 (2.9531932455029164e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.1239s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6580s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4659s for 90112 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6148s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2494s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3654s for 90112 events => throughput is 2.47E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561290) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.002182e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.506508e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.030333e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.463460e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2986s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2814s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.75E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8582s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5967s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2615s for 90112 events => throughput is 3.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4401s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2494s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1908s for 90112 events => throughput is 4.72E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.394744e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.791511e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.397811e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.864406e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3648s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3439s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0208s for 8192 events => throughput is 3.93E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8268s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5959s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2309s for 90112 events => throughput is 3.90E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.878315e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.871980e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3908s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3587s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.55E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9607s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6225s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3383s for 90112 events => throughput is 2.66E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.581657e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.673068e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263352] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7520s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263352) differ by less than 3E-14 (8.881784197001252e-16) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! 
Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561298] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0080s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0002s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 90112 events => throughput is 1.15E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561298) differ by less than 3E-14 (4.440892098500626e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.481010e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.144595e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.387721e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.496552e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.387837e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.772413e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.394383e+07 ) sec^-1 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! 
' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.774015e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index d9952f5cc5..3529f52591 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-02-02_17:46:35 +DATE: 2024-02-08_19:50:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4565s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3855s - [COUNTERS] Fortran MEs ( 1 ) : 0.0710s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3537s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3056s + [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3928s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3119s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2638s + [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.3373s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5510s - [COUNTERS] Fortran MEs ( 1 ) : 0.7864s for 90112 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7486s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2222s + [COUNTERS] Fortran MEs ( 1 ) : 0.5263s for 90112 events => throughput is 1.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110461852325612] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110149549279866] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3955s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3774s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0563s for 8192 events => throughput is 1.45E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110461852325612) differ by less than 4E-4 (2.8586276618058903e-06) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110149549279866) differ by less than 4E-4 (2.840326210895583e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510685241079500] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510678843355344] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4822s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6829s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7992s for 90112 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8952s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2754s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6199s for 90112 events => throughput is 1.45E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510685241079500) differ by less than 4E-4 (6.11548025553077e-08) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510678843355344) differ by less than 4E-4 (4.2350520312872675e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.184904e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.480352e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.180610e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.487866e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110456793177945] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110146988852984] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3737s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3493s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0244s for 8192 events => throughput is 3.36E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3050s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2850s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 8192 events => throughput is 4.10E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110456793177945) differ by less than 4E-4 (3.0452395031188573e-06) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110146988852984) differ by less than 4E-4 (2.934771267448788e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510681375304044] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510676993136629] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8670s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6025s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2645s for 90112 events => throughput is 3.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4630s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2419s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2211s for 90112 events => throughput is 4.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510681375304044) differ by less than 4E-4 (2.408689854238588e-07) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676993136629) differ by less than 4E-4 (1.2836447871311663e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.360736e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.182134e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.377049e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.303630e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110458350871136] fbridge_mode=1
+ [XSECTION] Cross section = 0.2711 [0.27110148793566186] fbridge_mode=1
 [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.3515s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3389s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0126s for 8192 events => throughput is 6.49E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2826s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2731s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.56E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110458350871136) differ by less than 4E-4 (2.987782395047489e-06)
+OK! xsec from fortran (0.27110226551166922) and cpp (0.27110148793566186) differ by less than 4E-4 (2.8682018052839098e-06)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510680866622453] fbridge_mode=1
+ [XSECTION] Cross section = 0.2151 [0.21510676419088856] fbridge_mode=1
 [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 1.7300s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5911s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1389s for 90112 events => throughput is 6.49E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3319s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2267s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1052s for 90112 events => throughput is 8.57E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510680866622453) differ by less than 4E-4 (2.6451684009831666e-07)
+OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676419088856) differ by less than 4E-4 (1.5505111905511626e-07)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.333530e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.765258e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.402861e+05 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510680866622453] fbridge_mode=1
- [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 1.7146s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5901s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1245s for 90112 events => throughput is 7.24E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.816790e+05 ) sec^-1

-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510680866622453) differ by less than 4E-4 (2.6451684009831666e-07)
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***

-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.145200e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.038764e+05 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1
- [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.3574s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3417s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0157s for 8192 events => throughput is 5.23E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110464176080312) differ by less than 4E-4 (2.772913590631809e-06)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510685411522340] fbridge_mode=1
- [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 1.7692s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5936s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1756s for 90112 events => throughput is 5.13E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510685411522340) differ by less than 4E-4 (5.3231167029821336e-08)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.942377e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.961752e+05 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1
- [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.7556s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7551s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.60E+07 events/s
-
-*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06)
-
-*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
-
-*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510689885789416] fbridge_mode=1
- [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 2.0066s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.0005s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0061s for 90112 events => throughput is 1.48E+07 events/s
-
-*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510689885789416) differ by less than 4E-4 (1.547708909921397e-07)
-
-*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.790476e+07 ) sec^-1
-
-*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.429082e+07 ) sec^-1
-
-*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.864574e+07 ) sec^-1
-
-*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.698152e+08 ) sec^-1
-
-*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.769393e+07 ) sec^-1
-
-*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.780344e+08 ) sec^-1
-
-*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.349514e+07 ) sec^-1
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
+ERROR! ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed
+ PDF set = nn23lo1
+ alpha_s(Mz)= 0.1300 running at 2 loops.
+ alpha_s(Mz)= 0.1300 running at 2 loops.
+ Renormalization scale set on event-by-event basis
+ Factorization scale set on event-by-event basis

-*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.932763e+07 ) sec^-1
-TEST COMPLETED
+ getting user params
+Enter number of events and max and min iterations:
+ Number of events and iterations 8192 1 1
diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index ada324b44d..7d9cc4ceb3 100644
--- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
 CUDACPP_BUILDDIR='.'
-
-
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+ make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'

 OMP_NUM_THREADS=

-DATE: 2024-02-02_17:47:05
+DATE: 2024-02-08_19:51:00

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0
+ [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0
 [UNWEIGHT] Wrote 404 events (found 1817 events)
- [COUNTERS] PROGRAM TOTAL : 0.4566s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3850s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0716s for 8192 events => throughput is 1.14E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3552s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3070s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0
+ [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0
 [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.3934s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3225s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0709s for 8192 events => throughput is 1.15E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3113s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2631s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0
+ [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0
 [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 2.3290s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5481s
- [COUNTERS] Fortran MEs ( 1 ) : 0.7809s for 90112 events => throughput is 1.15E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.7438s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2181s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.5256s for 90112 events => throughput is 1.71E+05 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110539348915991] fbridge_mode=1
+ [XSECTION] Cross section = 0.2711 [0.27110226549005623] fbridge_mode=1
 [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.4841s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4052s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0790s for 8192 events => throughput is 1.04E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4037s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3344s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0692s for 8192 events => throughput is 1.18E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348915991) differ by less than 2E-4 (8.658396222216425e-11)
+OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005623) differ by less than 2E-4 (7.972267290767832e-11)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1
+ [XSECTION] Cross section = 0.2151 [0.21510679758658835] fbridge_mode=1
 [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 2.5258s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6527s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8731s for 90112 events => throughput is 1.03E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.0503s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2898s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7605s for 90112 events => throughput is 1.18E+05 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686560794334) differ by less than 2E-4 (1.967879192932287e-10)
+OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658835) differ by less than 2E-4 (2.0059864880295208e-10)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.037120e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.201729e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.045880e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.203533e+05 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1
+ [XSECTION] Cross section = 0.2711 [0.27110226549005628] fbridge_mode=1
 [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.4045s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3647s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0399s for 8192 events => throughput is 2.05E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3288s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2961s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.51E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11)
+OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005628) differ by less than 2E-4 (7.972245086307339e-11)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1
+ [XSECTION] Cross section = 0.2151 [0.21510679758658832] fbridge_mode=1
 [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 2.0636s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6186s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4450s for 90112 events => throughput is 2.02E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.6141s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2546s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3595s for 90112 events => throughput is 2.51E+05 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686560794337) differ by less than 2E-4 (1.9678814133783362e-10)
+OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658832) differ by less than 2E-4 (2.0059842675834716e-10)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.983932e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.516035e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.998905e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.520975e+05 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1
+ [XSECTION] Cross section = 0.2711 [0.27110226530029391] fbridge_mode=1
 [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.3739s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3504s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2991s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2817s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0174s for 8192 events => throughput is 4.70E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10)
+OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226530029391) differ by less than 2E-4 (7.796884249344771e-10)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 16/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1
+ [XSECTION] Cross section = 0.2151 [0.21510679756340242] fbridge_mode=1
 [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 1.8560s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5979s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2581s for 90112 events => throughput is 3.49E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4227s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2332s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1895s for 90112 events => throughput is 4.76E+05 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11)
+OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679756340242) differ by less than 2E-4 (9.281064805577444e-11)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.484446e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.844894e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.492862e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.853378e+05 ) sec^-1

-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1
- [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.3676s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3467s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0209s for 8192 events => throughput is 3.91E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***

-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1
- [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 1.8166s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5929s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2237s for 90112 events => throughput is 4.03E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.972921e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.030453e+05 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1
- [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.3911s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3597s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0313s for 8192 events => throughput is 2.62E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1
- [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 1.9709s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6194s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3515s for 90112 events => throughput is 2.56E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.617505e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.601338e+05 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2711 [0.27110539343558532] fbridge_mode=1
- [UNWEIGHT] Wrote 404 events (found 1228 events)
- [COUNTERS] PROGRAM TOTAL : 0.7538s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7531s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.24E+07 events/s
-
-*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539343558532) differ by less than 2E-4 (2.8419933073564607e-10)
-
-*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
-
-*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 16/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1
- [UNWEIGHT] Wrote 1939 events (found 1944 events)
- [COUNTERS] PROGRAM TOTAL : 2.0009s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9932s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s
-
-*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620649053081024e-10)
-
-*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.595711e+07 ) sec^-1
-
-*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.107192e+07 ) sec^-1
-
-*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.381138e+07 ) sec^-1
-
-*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.518494e+07 ) sec^-1
-
-*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.385327e+07 ) sec^-1
-
-*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 6.813672e+07 ) sec^-1
-
-*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.399848e+07 ) sec^-1
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
+ERROR! ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed
+ PDF set = nn23lo1
+ alpha_s(Mz)= 0.1300 running at 2 loops.
+ alpha_s(Mz)= 0.1300 running at 2 loops.
+ Renormalization scale set on event-by-event basis
+ Factorization scale set on event-by-event basis

-*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.781110e+07 ) sec^-1
-TEST COMPLETED
+ getting user params
+Enter number of events and max and min iterations:
+ Number of events and iterations 8192 1 1
diff --git a/epochX/cudacpp/tput/allTees.sh b/epochX/cudacpp/tput/allTees.sh
index 9be4f5d4fc..55a7e010ea 100755
--- a/epochX/cudacpp/tput/allTees.sh
+++ b/epochX/cudacpp/tput/allTees.sh
@@ -32,8 +32,11 @@ while [ "$1" != "" ]; do
 opts+=" -makeonly"
 shift
 elif [ "$1" == "-hip" ]; then
- # Random numbers use rocrand instead of curand?
- rndhst=-rorhst
+ #### Random numbers use hiprand instead of curand?
+ ###rndhst=-hirhst
+ # See https://github.com/ROCm/hipRAND/issues/76
+ # Random numbers use common (not hiprand) instead of curand?
+ rndhst=-common
 shift
 else
 echo "Usage: $0 [-short] [-e] [-sa] [-makeonly] [-hip]"
@@ -77,8 +80,13 @@ ended4="$cmd\nENDED(4) AT $(date) [Status=$status]"

 # (72/78) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six processes (no rebuild needed)
 cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt ${rndhst} ${opts}"
-$cmd; status=$?
-ended5="$cmd\nENDED(5) AT $(date) [Status=$status]"
+if [ "${rndhst}" != "-common" ]; then
+  $cmd; status=$?
+  ended5="$cmd\nENDED(5) AT $(date) [Status=$status]"
+else
+  cmd="SKIP '$cmd'"; echo $cmd; status=$?
+  ended5="$cmd\nENDED(5) AT $(date) [Status=$status]"
+fi

 # (78/78) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six processes (no rebuild needed)
 cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ${opts}"
@@ -98,6 +106,7 @@ echo -e "$ended2"
 echo -e "$ended3"
 echo -e "$ended4"
 echo -e "$ended5"
+echo -e "$ended6"

 if [ "$ggttggg" == "" ]; then
 echo
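NB: the guard added in the allTees.sh hunk above is needed because the -hip configuration now sets rndhst=-common (hiprand is commented out pending https://github.com/ROCm/hipRAND/issues/76), so the rndhst-specific rerun would exactly duplicate the separate -common run that follows, and the script skips it instead. A minimal sketch of the pattern, with shortened hypothetical arguments rather than the verbatim script:

  #!/bin/bash
  rndhst=-common   # would be -hirhst if hiprand could be used (hipRAND issue 76)
  cmd="./tput/teeThroughputX.sh -eemumu -flt ${rndhst}"
  if [ "${rndhst}" != "-common" ]; then
    $cmd                # run the rndhst-specific test
  else
    echo "SKIP '$cmd'"  # identical to the later -common run, so skip it
  fi
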
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
index 774b5ce9b2..ace66bb6ed 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
@@ -1,209 +1,168 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'

-DATE: 2024-02-02_16:29:54
+DATE: 2024-02-08_18:16:31

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.732881e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.331651e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.299488e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.806481 sec
- 2,844,929,168 cycles # 3.002 GHz
- 4,476,498,275 instructions # 1.57 insn per cycle
- 1.144252755 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.334674e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.108654e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.338540e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0
+TOTAL : 1.109691 sec
+ 1,247,687,096 cycles:u # 1.072 GHz (74.94%)
+ 2,278,286 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.54%)
+ 5,704,503 stalled-cycles-backend:u # 0.46% backend cycles idle (75.11%)
+ 2,089,277,539 instructions:u # 1.67 insn per cycle
+ # 0.00 stalled cycles per insn (76.16%)
+ 1.704153461 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039868165201E-002
-Relative difference = 1.0277080522138477e-08
+Avg ME (F77/CUDA) = 1.2828039868165208E-002
+Relative difference = 1.0277079981222336e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.051572e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.222328e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.222328e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.380846 sec
- 19,508,626,142 cycles # 3.056 GHz
- 46,933,131,885 instructions # 2.41 insn per cycle
- 6.390323685 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.127928e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.288471e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.288471e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 6.367604 sec
+ 19,491,485,206 cycles:u # 3.046 GHz (75.00%)
+ 51,794,140 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.00%)
+ 47,658,866 stalled-cycles-backend:u # 0.24% backend cycles idle (75.00%)
+ 47,117,293,535 instructions:u # 2.42 insn per cycle
+ # 0.00 stalled cycles per insn (75.00%)
+ 6.401683865 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.670212e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.190161e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.190161e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.158860 sec
- 12,830,579,051 cycles # 3.081 GHz
- 31,183,618,088 instructions # 2.43 insn per cycle
- 4.174880373 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.745857e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.192455e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.192455e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.398930 sec
+ 13,221,919,735 cycles:u # 2.985 GHz (74.90%)
+ 52,263,631 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.97%)
+ 1,044,297,541 stalled-cycles-backend:u # 7.90% backend cycles idle (75.06%)
+ 31,151,962,295 instructions:u # 2.36 insn per cycle
+ # 0.03 stalled cycles per insn (75.08%)
+ 4.433514520 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING!
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.059286e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.882634e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.882634e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.452724 sec - 10,016,869,803 cycles # 2.896 GHz - 19,479,397,734 instructions # 1.94 insn per cycle - 3.466531959 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.408381e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.196444e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.196444e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.397303 sec + 10,167,564,191 cycles:u # 2.966 GHz (74.91%) + 51,874,302 stalled-cycles-frontend:u # 0.51% frontend cycles idle (75.00%) + 436,450,232 stalled-cycles-backend:u # 4.29% backend cycles idle (75.03%) + 19,342,752,766 instructions:u # 1.90 insn per cycle + # 0.02 stalled cycles per insn (75.03%) + 3.431966834 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.205390e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.192354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.192354e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.245257 sec - 9,575,667,027 cycles # 2.948 GHz - 18,941,225,947 instructions # 1.98 insn per cycle - 3.261392204 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.990111e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.736341e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.736341e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.560844 sec - 8,171,660,854 cycles # 2.293 GHz - 15,512,522,300 instructions # 1.90 insn per cycle - 3.578180530 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 6eb637fbed..a02e8efa0d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,222 +1,175 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_17:09:39 +DATE: 2024-02-08_19:08:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.476133e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502081e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.502081e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.311896 sec - 7,339,264,792 cycles # 2.882 GHz - 12,967,773,582 instructions # 1.77 insn per cycle - 2.616359359 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.485774e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.352334e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.352334e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.544822 sec + 18,357,049,736 cycles:u # 3.290 GHz (75.00%) + 121,804,388 stalled-cycles-frontend:u # 0.66% frontend cycles idle (75.00%) + 6,984,637,879 stalled-cycles-backend:u # 38.05% backend cycles idle (74.99%) + 17,123,769,634 instructions:u # 0.93 insn per cycle + # 0.41 stalled cycles per insn (75.06%) + 5.607290014 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.006057e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161244e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161244e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.853865 sec - 20,723,629,478 cycles # 3.021 GHz - 47,159,413,780 instructions # 2.28 insn per cycle - 6.861488246 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.234311e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.409182e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.409182e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.942439 sec + 19,893,415,201 cycles:u # 3.324 GHz (75.01%) + 52,113,264 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.02%) + 124,246,256 stalled-cycles-backend:u # 0.62% backend cycles idle (75.01%) + 47,361,048,369 instructions:u # 2.38 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 5.987785762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.549174e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.990542e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.990542e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.664182 sec - 14,080,140,987 cycles # 3.015 GHz - 32,025,465,654 instructions # 2.27 insn per cycle - 4.671778204 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.868611e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.336418e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336418e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.242379 sec + 13,941,244,348 cycles:u # 3.253 GHz (75.00%) + 53,854,319 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.00%) + 983,287,187 stalled-cycles-backend:u # 7.05% backend cycles idle (74.99%) + 31,985,098,389 instructions:u # 2.29 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 4.292601985 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.883823e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.570155e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.570155e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.970337 sec - 11,331,945,219 cycles # 2.851 GHz - 20,844,801,631 instructions # 1.84 insn per cycle - 3.978045545 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 2.533894e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.319888e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319888e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.369509 sec + 10,904,824,724 cycles:u # 3.196 GHz (74.97%) + 51,264,642 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.90%) + 495,017,962 stalled-cycles-backend:u # 4.54% backend cycles idle (74.92%) + 20,691,138,731 instructions:u # 1.90 insn per cycle + # 0.02 stalled cycles per insn (74.97%) + 3.416560925 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.020487e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.815821e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.815821e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.732253 sec - 10,845,348,709 cycles # 2.901 GHz - 20,302,403,026 instructions # 1.87 insn per cycle - 3.739927960 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.805048e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.412722e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.412722e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.111791 sec - 9,508,360,053 cycles # 2.310 GHz - 16,665,011,626 instructions # 1.75 insn per cycle - 4.119278704 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 604bbaf7d3..c001fa8fed 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,209 +1,165 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_17:23:09 +DATE: 2024-02-08_19:19:38 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.485352e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.577711e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.136362e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.205851e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.102555e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.330080e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.334946 sec - 4,627,867,695 cycles # 2.954 GHz - 7,260,273,067 instructions # 1.57 insn per cycle - 1.624200407 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 4.665145 sec + 15,379,655,035 cycles:u # 3.280 GHz (74.93%) + 54,006,110 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.85%) + 6,900,380,307 stalled-cycles-backend:u # 44.87% backend cycles idle (74.99%) + 11,488,247,444 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (75.13%) + 4.716396464 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.028605e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195713e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.195713e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.249184e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.428251e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.428251e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.891181 sec - 20,557,101,620 cycles # 2.982 GHz - 47,036,834,414 instructions # 2.29 insn per cycle - 6.897637957 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.763908 sec + 19,510,803,462 cycles:u # 3.368 GHz (75.01%) + 51,399,710 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.01%) + 63,104,773 stalled-cycles-backend:u # 0.32% backend cycles idle (75.01%) + 47,052,151,639 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 5.796432681 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.626561e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.129016e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.129016e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931734e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.436512e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.436512e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.623609 sec - 13,925,099,049 cycles # 3.010 GHz - 31,188,611,504 instructions # 2.24 insn per cycle - 4.629991459 seconds time elapsed +TOTAL : 3.982640 sec + 13,306,793,894 cycles:u # 3.317 GHz (74.89%) + 52,522,331 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.90%) + 1,016,515,877 stalled-cycles-backend:u # 7.64% backend cycles idle (74.99%) + 31,095,573,865 instructions:u # 2.34 insn per cycle + # 0.03 stalled cycles per insn (75.08%) + 4.014845336 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.050220e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.876170e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.876170e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.641894e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520503e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520503e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.827366 sec - 11,126,982,774 cycles # 2.903 GHz - 19,381,073,487 instructions # 1.74 insn per cycle - 3.833697052 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +TOTAL : 3.118388 sec + 10,246,999,910 cycles:u # 3.255 GHz (74.84%) + 49,473,514 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.92%) + 443,528,455 stalled-cycles-backend:u # 4.33% backend cycles idle (75.04%) + 19,336,467,898 instructions:u # 1.89 insn per cycle + # 0.02 stalled cycles per insn (75.10%) + 3.150560934 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.129207e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.079891e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.079891e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.733090 sec - 10,748,932,959 cycles # 2.877 GHz - 18,644,768,044 instructions # 1.73 insn per cycle - 3.739463223 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.941689e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.671330e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.671330e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.008583 sec - 9,299,039,128 cycles # 2.317 GHz - 15,211,947,853 instructions # 1.64 insn per cycle - 4.015036736 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165090E-002
-Relative difference = 1.0277089176796747e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
index 96a5734fdb..04f97e0270 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt
@@ -6,8 +6,9 @@ AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
@@ -41,7 +42,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-02-02_17:19:50
+DATE: 2024-02-05_22:04:49
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +51,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.493577e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.598230e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.162874e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.500482e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.605097e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.137972e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.975356 sec
- 3,544,940,501 cycles # 2.941 GHz
- 7,060,681,723 instructions # 1.99 insn per cycle
- 1.262676641 seconds time elapsed
+TOTAL : 0.958819 sec
+ 3,612,761,264 cycles # 3.040 GHz
+ 7,190,638,907 instructions # 1.99 insn per cycle
+ 1.246173317 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +78,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.031205e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.196634e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.196634e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.049359e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.218824e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.218824e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.499666 sec
- 19,509,328,806 cycles # 3.000 GHz
- 46,933,604,410 instructions # 2.41 insn per cycle
- 6.506443087 seconds time elapsed
+TOTAL : 6.388054 sec
+ 19,467,473,893 cycles # 3.046 GHz
+ 46,934,600,434 instructions # 2.41 insn per cycle
+ 6.394290908 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
@@ -104,14 +105,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.637240e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.143690e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.143690e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.659117e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.166540e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.166540e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.238907 sec
- 12,808,787,625 cycles # 3.018 GHz
- 31,182,997,723 instructions # 2.43 insn per cycle
- 4.245173568 seconds time elapsed
+TOTAL : 4.181185 sec
+ 12,809,069,138 cycles # 3.060 GHz
+ 31,183,695,070 instructions # 2.43 insn per cycle
+ 4.187406583 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
@@ -131,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.049769e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.874788e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.874788e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.119833e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.977991e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.977991e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.468338 sec
- 10,059,020,096 cycles # 2.896 GHz
- 19,479,848,023 instructions # 1.94 insn per cycle
- 3.474551673 seconds time elapsed
+TOTAL : 3.357662 sec
+ 9,984,165,417 cycles # 2.969 GHz
+ 19,478,887,428 instructions # 1.95 insn per cycle
+ 3.363778767 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
@@ -158,14 +159,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.165075e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.101264e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.101264e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.227801e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.199728e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.199728e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.300926 sec
- 9,573,220,141 cycles # 2.896 GHz
- 18,942,234,299 instructions # 1.98 insn per cycle
- 3.307231967 seconds time elapsed
+TOTAL : 3.214378 sec
+ 9,537,291,787 cycles # 2.963 GHz
+ 18,941,744,436 instructions # 1.99 insn per cycle
+ 3.220442066 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
@@ -185,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.946617e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.665765e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.665765e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.008509e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.773490e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.773490e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.638863 sec
- 8,160,188,498 cycles # 2.241 GHz
- 15,511,546,976 instructions # 1.90 insn per cycle
- 3.645034588 seconds time elapsed
+TOTAL : 3.524415 sec
+ 8,196,437,380 cycles # 2.322 GHz
+ 15,510,956,697 instructions # 1.89 insn per cycle
+ 3.530544205 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
index 272523a1d1..78a5f84dc3 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,211 +1,169 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-02-02_17:16:28
+DATE: 2024-02-08_19:15:53
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.037906e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.538504e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.021580e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 1.878698 sec
- 6,255,263,118 cycles # 2.965 GHz
- 11,446,239,176 instructions # 1.83 insn per cycle
- 2.166766626 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.447010e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.077449e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.305009e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 5.410601 sec
+ 17,946,066,362 cycles:u # 3.296 GHz (75.03%)
+ 120,567,509 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.04%)
+ 6,888,104,059 stalled-cycles-backend:u # 38.38% backend cycles idle (75.03%)
+ 16,750,524,596 instructions:u # 0.93 insn per cycle
+ # 0.41 stalled cycles per insn (75.01%)
+ 5.466546820 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039868165201E-002
-Relative difference = 1.0277080522138477e-08
+Avg ME (F77/CUDA) = 1.2828039868165208E-002
+Relative difference = 1.0277079981222336e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.031880e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.198492e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.198492e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.500316 sec
- 19,496,435,636 cycles # 2.998 GHz
- 46,934,465,008 instructions # 2.41 insn per cycle
- 6.506539601 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.249236e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.428795e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.428795e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 5.769202 sec
+ 19,521,833,039 cycles:u # 3.365 GHz (74.93%)
+ 50,351,159 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.00%)
+ 59,959,935 stalled-cycles-backend:u # 0.31% backend cycles idle (75.04%)
+ 47,022,405,845 instructions:u # 2.41 insn per cycle
+ # 0.00 stalled cycles per insn (75.04%)
+ 5.803333520 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.599472e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.094360e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.094360e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.339300 sec
- 12,819,695,653 cycles # 2.952 GHz
- 31,184,731,356 instructions # 2.43 insn per cycle
- 4.345623570 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.935766e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.436965e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.436965e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.980612 sec
+ 13,283,927,789 cycles:u # 3.311 GHz (74.88%)
+ 52,410,542 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.90%)
+ 985,416,242 stalled-cycles-backend:u # 7.42% backend cycles idle (75.00%)
+ 31,113,123,354 instructions:u # 2.34 insn per cycle
+ # 0.03 stalled cycles per insn (75.08%)
+ 4.014719150 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.047764e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.875425e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.875425e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.471043 sec
- 10,040,257,055 cycles # 2.889 GHz
- 19,479,117,709 instructions # 1.94 insn per cycle
- 3.477341967 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.663450e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.544894e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.544894e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.095489 sec
+ 10,131,189,853 cycles:u # 3.240 GHz (74.93%)
+ 48,075,304 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.95%)
+ 459,616,019 stalled-cycles-backend:u # 4.54% backend cycles idle (74.95%)
+ 19,408,429,710 instructions:u # 1.92 insn per cycle
+ # 0.02 stalled cycles per insn (74.93%)
+ 3.129267263 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165090E-002
 Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.171518e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.126780e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.126780e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.294735 sec
- 9,565,516,137 cycles # 2.899 GHz
- 18,941,970,742 instructions # 1.98 insn per cycle
- 3.301187541 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165090E-002
-Relative difference = 1.0277089176796747e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.952397e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.678466e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.678466e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.623568 sec
- 8,156,521,447 cycles # 2.248 GHz
- 15,510,993,062 instructions # 1.90 insn per cycle
- 3.629777699 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165090E-002
-Relative difference = 1.0277089176796747e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
index d323cd06df..6410590fe2 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt
@@ -1,209 +1,168 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-02-02_16:30:29
+DATE: 2024-02-08_18:17:02
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.572182e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.390685e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.194608e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.685538 sec
- 2,782,656,073 cycles # 3.012 GHz
- 4,246,479,392 instructions # 1.53 insn per cycle
- 0.998234046 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.910052e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.591643e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.913749e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0
+TOTAL : 0.537799 sec
+ 1,246,252,190 cycles:u # 2.313 GHz (73.78%)
+ 2,375,538 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.48%)
+ 5,063,837 stalled-cycles-backend:u # 0.41% backend cycles idle (75.18%)
+ 2,025,449,879 instructions:u # 1.63 insn per cycle
+ # 0.00 stalled cycles per insn (74.67%)
+ 0.588688986 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039868165206E-002
-Relative difference = 1.027708011645137e-08
+Avg ME (F77/CUDA) = 1.2828039868165216E-002
+Relative difference = 1.0277079305077159e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.127735e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.323902e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.323902e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 5.971708 sec
- 18,439,818,189 cycles # 3.086 GHz
- 44,717,274,583 instructions # 2.43 insn per cycle
- 5.980312238 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.189137e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.367378e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.367378e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 6.063807 sec
+ 18,518,409,022 cycles:u # 3.040 GHz (74.93%)
+ 52,837,559 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.93%)
+ 49,068,178 stalled-cycles-backend:u # 0.26% backend cycles idle (75.00%)
+ 44,809,533,231 instructions:u # 2.42 insn per cycle
+ # 0.00 stalled cycles per insn (75.06%)
+ 6.095541198 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 485) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164921E-002
 Relative difference = 1.0277102294013186e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.730545e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.290692e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.290692e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.027911 sec
- 12,421,567,897 cycles # 3.079 GHz
- 30,108,100,061 instructions # 2.42 insn per cycle
- 4.045730632 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.815969e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.308592e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.308592e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.252278 sec
+ 12,725,814,930 cycles:u # 2.971 GHz (74.98%)
+ 52,143,435 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.98%)
+ 92,946,219 stalled-cycles-backend:u # 0.73% backend cycles idle (74.99%)
+ 30,141,549,743 instructions:u # 2.37 insn per cycle
+ # 0.00 stalled cycles per insn (74.99%)
+ 4.287489156 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164921E-002
 Relative difference = 1.0277102294013186e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.082950e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.910776e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.910776e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.409589 sec
- 10,081,155,779 cycles # 2.952 GHz
- 19,114,889,377 instructions # 1.90 insn per cycle
- 3.424316658 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.349782e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.096958e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.096958e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.462220 sec
+ 10,313,532,563 cycles:u # 2.952 GHz (74.88%)
+ 47,667,709 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.97%)
+ 297,463,767 stalled-cycles-backend:u # 2.88% backend cycles idle (75.04%)
+ 19,040,300,630 instructions:u # 1.85 insn per cycle
+ # 0.02 stalled cycles per insn (75.04%)
+ 3.497579708 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165093E-002
 Relative difference = 1.0277088906338675e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.161465e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.121696e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.121696e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.314659 sec
- 9,424,933,322 cycles # 2.839 GHz
- 18,490,021,834 instructions # 1.96 insn per cycle
- 3.331229686 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165093E-002
-Relative difference = 1.0277088906338675e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.423529e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.621277e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.621277e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.989998 sec
- 7,198,177,033 cycles # 2.403 GHz
- 13,863,605,002 instructions # 1.93 insn per cycle
- 3.008648384 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165093E-002
-Relative difference = 1.0277088906338675e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
index 6abfecc259..7bd9ae25ca 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
@@ -1,209 +1,168 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-02-02_16:58:29
+DATE: 2024-02-08_18:49:03
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.470891e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.608196e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.175441e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.673015 sec
- 2,673,416,682 cycles # 2.946 GHz
- 4,155,190,412 instructions # 1.55 insn per cycle
- 0.968662512 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.313530e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.103673e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.331803e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0
+TOTAL : 0.502044 sec
+ 1,327,766,940 cycles:u # 2.541 GHz (74.23%)
+ 2,288,038 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.26%)
+ 5,806,237 stalled-cycles-backend:u # 0.44% backend cycles idle (74.94%)
+ 2,064,482,988 instructions:u # 1.55 insn per cycle
+ # 0.00 stalled cycles per insn (75.49%)
+ 0.552230357 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039868165201E-002
-Relative difference = 1.0277080522138477e-08
+Avg ME (F77/CUDA) = 1.2828039868165208E-002
+Relative difference = 1.0277079981222336e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.420469e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.751710e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.751710e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.831830 sec
- 14,607,459,882 cycles # 3.021 GHz
- 36,698,095,447 instructions # 2.51 insn per cycle
- 4.838213031 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.760335e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.143552e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.143552e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.299727 sec
+ 14,264,485,997 cycles:u # 3.295 GHz (74.94%)
+ 51,585,853 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.03%)
+ 516,456,100 stalled-cycles-backend:u # 3.62% backend cycles idle (75.06%)
+ 36,766,031,380 instructions:u # 2.58 insn per cycle
+ # 0.01 stalled cycles per insn (75.06%)
+ 4.332006754 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.080104e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.961280e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.961280e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.421250 sec
- 10,342,099,837 cycles # 3.018 GHz
- 24,753,393,807 instructions # 2.39 insn per cycle
- 3.427709695 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.396616e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.209079e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.209079e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.346787 sec
+ 10,977,650,608 cycles:u # 3.252 GHz (74.78%)
+ 52,064,364 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.95%)
+ 68,767,635 stalled-cycles-backend:u # 0.63% backend cycles idle (75.07%)
+ 24,719,153,812 instructions:u # 2.25 insn per cycle
+ # 0.00 stalled cycles per insn (75.12%)
+ 3.380096646 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING!
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.360911e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.552117e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.552117e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.060838 sec - 8,917,300,226 cycles # 2.909 GHz - 16,954,731,314 instructions # 1.90 insn per cycle - 3.067126118 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.966819e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.124832e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.124832e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.849245 sec + 9,188,509,383 cycles:u # 3.192 GHz (75.02%) + 48,362,362 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.99%) + 514,786,293 stalled-cycles-backend:u # 5.60% backend cycles idle (74.99%) + 16,821,404,609 instructions:u # 1.83 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 2.882688928 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.552098e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.975324e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.975324e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.863523 sec - 8,346,304,365 cycles # 2.910 GHz - 16,297,690,711 instructions # 1.95 insn per cycle - 2.869819767 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.138804e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.043053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.043053e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.340990 sec - 7,692,387,899 cycles # 2.299 GHz - 14,352,863,379 instructions # 1.87 insn per cycle - 3.347829135 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
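[Editorial aside on the cmpExe lines above: each block compares the average matrix element from the C++/CUDA/HIP run against the F77 run and accepts if the relative difference is within 5E-3. The following is a minimal standalone sketch of that kind of tolerance check, using values copied from the log; the function name and structure are assumptions for illustration, not the repository's actual cmpExe code.]

// Hedged sketch: NOT the repository's cmpExe implementation.
#include <cmath>
#include <cstdio>

bool compareME( double avgME1, double avgME2, double tolerance = 5E-3 )
{
  // Relative difference between the two average matrix elements
  const double reldif = std::fabs( avgME1 - avgME2 ) / std::fabs( avgME1 );
  printf( "Relative difference = %.16e\n", reldif );
  if( reldif <= tolerance )
    printf( "OK (relative difference <= %.0E)\n", tolerance );
  else
    printf( "ERROR (relative difference > %.0E)\n", tolerance );
  return reldif <= tolerance;
}

int main()
{
  // Averages taken from one of the log entries above (C++ vs F77);
  // this reproduces a relative difference of order 1e-08, well below 5E-3.
  return compareME( 1.282804e-02, 1.2828039868164916E-002 ) ? 0 : 1;
}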
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 00a3aeb9ee..fce8cfaf0d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,209 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_16:59:01 +DATE: 2024-02-08_18:49:27 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.460480e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.603398e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.192348e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.669878 sec - 2,672,223,071 cycles # 2.956 GHz - 4,118,263,895 instructions # 1.54 insn per cycle - 0.964425757 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.942345e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.599595e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.922004e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.486783 sec + 1,253,643,250 cycles:u # 2.466 GHz (75.00%) + 2,246,551 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.98%) + 5,266,359 stalled-cycles-backend:u # 0.42% backend cycles idle (74.91%) + 1,994,045,893 instructions:u # 1.59 insn per cycle + # 0.00 stalled cycles per insn (74.77%) + 0.536081700 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165206E-002 -Relative difference = 1.027708011645137e-08 +Avg ME (F77/CUDA) = 1.2828039868165216E-002 +Relative difference = 1.0277079305077159e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.993214e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.709292e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.709292e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.554549 sec - 10,766,799,093 cycles # 3.025 GHz - 28,354,945,748 instructions # 2.63 insn per cycle - 3.560735285 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.414536e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.195615e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.195615e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.329056 sec + 10,883,365,540 cycles:u # 3.241 GHz (74.99%) + 50,913,166 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.99%) + 65,828,742 stalled-cycles-backend:u # 0.60% backend cycles idle (74.99%) + 28,550,737,411 instructions:u # 2.62 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 3.361278232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.354629e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.550684e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.550684e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.068461 sec - 9,247,182,211 cycles # 3.009 GHz - 21,586,461,780 instructions # 2.33 insn per cycle - 3.074519229 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.612188e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.618351e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.618351e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.131632 sec + 10,129,423,232 cycles:u # 3.205 GHz (74.97%) + 48,255,673 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.97%) + 71,858,708 stalled-cycles-backend:u # 0.71% backend cycles idle (74.97%) + 21,702,694,393 instructions:u # 2.14 insn per cycle + # 0.00 stalled cycles per insn (74.82%) + 3.164603233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.499694e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.838826e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838826e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.918296 sec - 8,395,839,366 cycles # 2.872 GHz - 15,943,675,133 instructions # 1.90 insn per cycle - 2.924421973 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.260862e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.707458e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.707458e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.658223 sec + 8,544,190,465 cycles:u # 3.178 GHz (74.97%) + 48,674,787 stalled-cycles-frontend:u # 0.57% frontend cycles idle (75.01%) + 133,578,012 stalled-cycles-backend:u # 1.56% backend cycles idle (75.01%) + 15,840,793,330 instructions:u # 1.85 insn per cycle + # 0.01 stalled cycles per insn (75.01%) + 2.692235669 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1479) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165093E-002 Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.615544e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.200945e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.200945e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.819210 sec - 7,873,801,649 cycles # 2.790 GHz - 15,370,972,545 instructions # 1.95 insn per cycle - 2.825473468 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.250725e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273167e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.273167e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.196351 sec - 7,362,907,139 cycles # 2.300 GHz - 13,880,492,959 instructions # 1.89 insn per cycle - 3.202772131 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
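[Editorial aside on the "is not supported (no avx512vl in /proc/cpuinfo)" lines above: on the AMD EPYC host the 512y/512z builds are skipped at runtime because the CPU does not advertise avx512vl. A simple (Linux-specific) way to perform such a guard is to scan /proc/cpuinfo for the flag, as sketched below; the actual check in the test scripts may be implemented differently, so treat this as an assumption-labeled illustration.]

// Hedged sketch: one possible avx512vl runtime guard, not the repository's code.
#include <fstream>
#include <iostream>
#include <string>

bool cpuHasFlag( const std::string& flag )
{
  std::ifstream cpuinfo( "/proc/cpuinfo" );
  std::string line;
  while( std::getline( cpuinfo, line ) )
    if( line.rfind( "flags", 0 ) == 0 && line.find( " " + flag ) != std::string::npos )
      return true; // flag found in the CPU feature list
  return false;
}

int main()
{
  if( !cpuHasFlag( "avx512vl" ) )
  {
    std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
    return 0; // skip the 512y/512z tests on this host
  }
  std::cout << "avx512vl found: 512y/512z builds can run" << std::endl;
  return 0;
}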
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index cc5d5d6a08..5d6584ccba 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,209 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_16:31:01 +DATE: 2024-02-08_18:17:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.172627e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199322e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.283330e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.573086 sec - 2,416,798,560 cycles # 3.011 GHz - 3,754,207,790 instructions # 1.55 insn per cycle - 0.877602394 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.916462e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.225716e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.982310e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 +TOTAL : 0.414034 sec + 937,519,241 cycles:u # 2.165 GHz (74.85%) + 2,159,096 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.76%) + 5,666,910 stalled-cycles-backend:u # 0.60% backend cycles idle (75.20%) + 1,814,478,545 instructions:u # 1.94 insn per cycle + # 0.00 stalled cycles per insn (75.92%) + 0.461552135 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.116439e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313811e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.313811e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.991445 sec - 18,557,682,273 cycles # 3.095 GHz - 47,046,241,172 instructions # 2.54 insn per cycle - 6.000725208 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.274551e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.469484e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.469484e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.640760 sec + 17,255,169,189 cycles:u # 3.045 GHz (74.95%) + 38,908,929 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.02%) + 34,845,885 stalled-cycles-backend:u # 0.20% backend cycles idle (75.02%) + 47,149,536,158 instructions:u # 2.73 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 5.670509554 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.379901e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.641666e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.641666e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.992749 sec - 9,233,228,495 cycles # 3.079 GHz - 22,092,197,385 instructions # 2.39 insn per cycle - 3.005213085 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.698799e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.860879e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.860879e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 3.065307 sec + 9,186,630,006 cycles:u # 2.971 GHz (74.94%) + 40,710,785 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.91%) + 649,259,763 stalled-cycles-backend:u # 7.07% backend cycles idle (74.93%) + 22,235,538,357 instructions:u # 2.42 insn per cycle + # 0.03 stalled cycles per insn (74.93%) + 3.096130434 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.555800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.962236e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.962236e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.826275 sec - 8,191,236,644 cycles # 2.894 GHz - 15,625,311,974 instructions # 1.91 insn per cycle - 2.843388117 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.083899e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.517332e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.517332e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.771129 sec + 8,184,044,617 cycles:u # 2.924 GHz (74.90%) + 43,196,009 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.87%) + 1,429,360,940 stalled-cycles-backend:u # 17.47% backend cycles idle (74.87%) + 15,580,671,066 instructions:u # 1.90 insn per cycle + # 0.09 stalled cycles per insn (74.95%) + 2.802871026 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.731269e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.368092e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.368092e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.657137 sec - 7,886,745,126 cycles # 2.962 GHz - 15,296,514,202 instructions # 1.94 insn per cycle - 2.674160319 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.750373e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.358704e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.358704e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.641515 sec - 6,407,621,369 cycles # 2.421 GHz - 12,623,306,303 instructions # 1.97 insn per cycle - 2.655723578 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
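[Editorial aside on the repeated warning "RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)": the workflow summaries change from CURHST (host curand) to COMMON accordingly, and hasCurand/hasHiprand build tags select which backends are even available. The sketch below illustrates the fallback decision implied by these log lines; the enum and function names are hypothetical and do not claim to match the plugin's actual selection code.]

// Hedged sketch of the implied backend fallback; names are assumptions.
#include <cstdio>

enum class RandomNumberMode { CommonRandom, CurandHost, HiprandDevice };

RandomNumberMode chooseRndBackend( bool ramboOnDevice, bool hasHiprand )
{
  if( hasHiprand && ramboOnDevice )
    return RandomNumberMode::HiprandDevice; // "HIRDEV" in the workflow summary
  if( hasHiprand && !ramboOnDevice )
  {
    // Host-side hiprand sampling is not implemented, so fall back to CommonRandom
    printf( "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)\n" );
    return RandomNumberMode::CommonRandom; // "COMMON" in the workflow summary
  }
  return RandomNumberMode::CommonRandom;
}

int main()
{
  chooseRndBackend( /*ramboOnDevice=*/false, /*hasHiprand=*/true ); // prints the warning
  return 0;
}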
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index dd941f7ce9..f07a388ac4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,222 +1,175 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_17:10:18 +DATE: 2024-02-08_19:08:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.169451e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.471060e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.471060e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.683599 sec - 5,675,815,822 cycles # 2.966 GHz - 10,284,516,165 instructions # 1.81 insn per cycle - 1.970153509 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.601612e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.308011e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.308011e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 5.356447 sec + 17,736,085,096 cycles:u # 3.294 GHz (74.99%) + 116,600,593 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.99%) + 6,936,520,666 stalled-cycles-backend:u # 39.11% backend cycles idle (75.04%) + 17,072,655,798 instructions:u # 0.96 insn per cycle + # 0.41 stalled cycles per insn (75.05%) + 5.411029879 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.061837e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.244243e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.244243e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.393928 sec - 19,212,157,842 cycles # 3.002 GHz - 47,195,254,033 instructions # 2.46 insn per cycle - 6.401445537 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.413194e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.639057e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.639057e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.183772 sec + 17,450,185,792 cycles:u # 3.345 GHz (75.01%) + 39,608,463 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.00%) + 44,336,860 stalled-cycles-backend:u # 0.25% backend cycles idle (75.00%) + 47,390,057,919 instructions:u # 2.72 insn per cycle + # 0.00 stalled cycles per insn (74.94%) + 5.218741499 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.235655e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.344799e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.344799e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.292289 sec - 9,984,375,424 cycles # 3.027 GHz - 23,429,323,761 instructions # 2.35 insn per cycle - 3.299324497 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.873566e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.045756e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.045756e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.955526 sec + 9,625,837,302 cycles:u # 3.220 GHz (74.85%) + 41,879,056 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.85%) + 646,421,280 stalled-cycles-backend:u # 6.72% backend cycles idle (74.98%) + 23,590,603,216 instructions:u # 2.45 insn per cycle + # 0.03 stalled cycles per insn (75.11%) + 2.992557705 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.455698e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.743704e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.743704e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.046057 sec - 8,936,042,448 cycles # 2.928 GHz - 16,750,997,250 instructions # 1.87 insn per cycle - 3.053264860 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.297838e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.817354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.817354e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.668494 sec + 8,641,490,092 cycles:u # 3.199 GHz (74.83%) + 41,544,998 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.83%) + 1,454,939,462 stalled-cycles-backend:u # 16.84% backend cycles idle (74.96%) + 16,609,731,375 instructions:u # 1.92 insn per cycle + # 0.09 stalled cycles per insn (75.11%) + 2.705371701 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.554336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.982800e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.982800e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.949996 sec - 8,649,926,207 cycles # 2.928 GHz - 16,423,610,885 instructions # 1.90 insn per cycle - 2.957039248 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.556555e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.919638e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.919638e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.943833 sec - 7,178,442,881 cycles # 2.434 GHz - 13,849,630,832 instructions # 1.93 insn per cycle - 2.950865155 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 916b9fab00..5ec4d4ab6c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,209 +1,165 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_17:23:46 +DATE: 2024-02-08_19:20:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.305858e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.176187e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.243282e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.174569 sec - 4,137,221,599 cycles # 2.964 GHz - 6,628,706,350 instructions # 1.60 insn per cycle - 1.453756616 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.802337e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.196569e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.957531e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.536180 sec + 15,016,874,212 cycles:u # 3.295 GHz (74.91%) + 53,693,692 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.93%) + 7,009,068,488 stalled-cycles-backend:u # 46.67% backend cycles idle (75.07%) + 11,008,824,349 instructions:u # 0.73 insn per cycle + # 0.64 stalled cycles per insn (75.07%) + 4.580610638 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074840e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.263721e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.263721e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.424112e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.654453e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.654453e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.551388 sec - 19,561,947,739 cycles # 2.984 GHz - 47,228,101,461 instructions # 2.41 insn per cycle - 6.557477925 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.087804 sec + 17,240,457,051 cycles:u # 3.371 GHz (74.97%) + 39,475,809 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.97%) + 39,235,588 stalled-cycles-backend:u # 0.23% backend cycles idle (74.97%) + 47,225,654,656 instructions:u # 2.74 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 5.116304782 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.305987e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.540160e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.540160e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.921746e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.147753e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.147753e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.424686 sec - 10,266,744,074 cycles # 2.994 GHz - 22,174,524,084 instructions # 2.16 insn per cycle - 3.430655977 seconds time elapsed +TOTAL : 2.830451 sec + 9,319,060,198 cycles:u # 3.261 GHz (74.93%) + 41,069,989 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.09%) + 645,303,255 stalled-cycles-backend:u # 6.92% backend cycles idle (75.09%) + 22,133,332,086 instructions:u # 2.38 insn per cycle + # 0.03 stalled cycles per insn (75.09%) + 2.859939720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.563191e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.993149e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.993149e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.415975e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.004327e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.004327e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.152202 sec - 9,193,580,603 cycles # 2.913 GHz - 15,537,306,775 instructions # 1.69 insn per cycle - 3.158288262 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.511277 sec + 8,232,656,213 cycles:u # 3.245 GHz (74.82%) + 42,044,882 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.95%) + 1,462,340,926 stalled-cycles-backend:u # 17.76% backend cycles idle (75.09%) + 15,504,031,361 instructions:u # 1.88 insn per cycle + # 0.09 stalled cycles per insn (75.09%) + 2.539529796 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.658602e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.279018e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.279018e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.066058 sec - 8,964,510,869 cycles # 2.919 GHz - 15,006,664,372 instructions # 1.67 insn per cycle - 3.072231903 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.663013e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.206015e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.206015e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.063222 sec - 7,429,065,971 cycles # 2.422 GHz - 12,333,291,202 instructions # 1.66 insn per cycle - 3.069077659 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 09b570c231..43d8f0743f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -6,8 +6,9 @@ AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' @@ -41,7 +42,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_17:20:24 +DATE: 2024-02-05_22:05:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +51,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.304852e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188459e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.289386e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.313608e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.194062e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.294541e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.846896 sec - 3,152,447,187 cycles # 2.954 GHz - 6,399,531,397 instructions # 2.03 insn per cycle - 1.125590753 seconds time elapsed +TOTAL : 0.837839 sec + 3,221,458,304 cycles # 3.025 GHz + 6,460,874,497 instructions # 2.01 insn per cycle + 1.122134673 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -77,14 +78,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.090733e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281870e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281870e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.114925e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.312637e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.312637e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.131573 sec - 18,559,746,055 cycles # 3.025 GHz - 47,046,615,294 instructions # 2.53 insn per cycle - 6.137716677 seconds time elapsed +TOTAL : 5.995075 sec + 18,550,724,282 cycles # 3.092 GHz + 47,045,610,283 instructions # 2.54 insn per cycle + 6.000876182 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +105,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.336953e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.576039e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.576039e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.328720e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.561231e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.561231e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.050563 sec - 9,242,499,904 cycles # 3.025 GHz - 22,091,627,720 instructions # 2.39 insn per cycle - 3.056791767 seconds time elapsed +TOTAL : 3.060634 sec + 9,255,866,935 cycles # 3.027 GHz + 22,095,165,932 instructions # 2.39 insn per cycle + 3.066453520 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.501830e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.877547e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.877547e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.644282e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.130925e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.130925e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.885737 sec - 8,156,328,148 cycles # 2.822 GHz - 15,624,590,007 instructions # 1.92 insn per cycle - 2.891980770 seconds time elapsed +TOTAL : 2.731302 sec + 8,179,125,857 cycles # 2.990 GHz + 15,624,271,395 instructions # 1.91 insn per cycle + 2.737082415 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +159,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.608973e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155332e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155332e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.755995e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.398064e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.398064e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.781607 sec - 7,877,118,719 cycles # 2.834 GHz - 15,299,796,256 instructions # 1.94 insn per cycle - 2.787750292 seconds time elapsed +TOTAL : 2.631869 sec + 7,862,262,943 cycles # 2.982 GHz + 15,296,823,766 instructions # 1.95 insn per cycle + 2.637847648 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.679159e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.253519e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.253519e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.790427e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.441377e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.441377e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.709877 sec - 6,441,740,307 cycles # 2.373 GHz - 12,623,177,096 instructions # 1.96 insn per cycle - 2.715857497 seconds time elapsed +TOTAL : 2.608833 sec + 6,396,610,736 cycles # 2.448 GHz + 12,622,932,331 instructions # 1.97 insn per cycle + 2.614856919 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index becab2fe0f..084018ae7a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,211 +1,169 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for 
tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_17:17:03 +DATE: 2024-02-08_19:16:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.818927e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140625e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.137667e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.492821 sec - 5,106,654,416 cycles # 2.979 GHz - 9,234,091,370 instructions # 1.81 insn per cycle - 1.772743442 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.352208e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.992270e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.700820e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 5.295666 sec + 17,656,305,360 cycles:u # 3.317 GHz (74.94%) + 117,895,335 stalled-cycles-frontend:u # 0.67% frontend cycles idle (74.91%) + 6,965,456,924 stalled-cycles-backend:u # 39.45% backend cycles idle (74.95%) + 16,720,237,335 instructions:u # 0.95 insn per cycle + # 0.42 stalled cycles per insn (75.04%) + 5.346362564 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.076052e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.265481e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.265481e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.212858 sec - 18,597,442,476 cycles # 2.995 GHz - 47,049,595,143 instructions # 2.53 insn per cycle - 6.218975920 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.425932e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.655535e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655535e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.086483 sec + 17,217,885,511 cycles:u # 3.367 GHz (74.97%) + 39,329,051 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.97%) + 38,892,111 stalled-cycles-backend:u # 0.23% backend cycles idle (74.98%) + 47,250,198,154 instructions:u # 2.74 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 5.115965831 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.335657e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.570735e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.570735e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.053150 sec - 9,218,466,968 cycles # 3.015 GHz - 22,091,551,341 instructions # 2.40 insn per cycle - 3.059217010 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.929891e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.150810e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.150810e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.824875 sec + 9,281,749,018 cycles:u # 3.255 GHz (75.00%) + 40,736,236 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.04%) + 617,474,929 stalled-cycles-backend:u # 6.65% backend cycles idle (75.04%) + 22,147,277,727 instructions:u # 2.39 insn per cycle + # 0.03 stalled cycles per insn (75.04%) + 2.854201201 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.563651e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.985728e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.985728e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.819573 sec - 8,172,497,199 cycles # 2.894 GHz - 15,625,651,168 instructions # 1.91 insn per cycle - 2.825655579 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.420886e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.010214e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.010214e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.515060 sec + 8,209,191,572 cycles:u # 3.230 GHz (74.85%) + 42,338,926 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.85%) + 1,453,034,382 stalled-cycles-backend:u # 17.70% backend cycles idle (74.89%) + 15,532,414,795 instructions:u # 1.89 insn per cycle + # 0.09 stalled cycles per insn (75.05%) + 2.544513289 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.685211e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.288179e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.288179e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.700852 sec - 7,860,982,842 cycles # 2.905 GHz - 15,296,030,854 instructions # 1.95 insn per cycle - 2.706922728 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.678776e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.232666e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.232666e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.710793 sec - 6,408,231,928 cycles # 2.360 GHz - 12,623,114,100 instructions # 1.97 insn per cycle - 2.716743452 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index b62bccc72b..7370ecd908 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,209 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_16:31:32 +DATE: 2024-02-08_18:17:57 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.166262e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.228331e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.344578e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.579979 sec - 2,328,759,612 cycles # 2.885 GHz - 3,643,533,967 instructions # 1.56 insn per cycle - 0.876244169 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.914450e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.243059e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.013320e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 +TOTAL : 0.409693 sec + 939,976,117 cycles:u # 2.194 GHz (73.83%) + 2,210,035 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.72%) + 5,004,797 stalled-cycles-backend:u # 0.53% backend cycles idle (75.80%) + 1,809,251,298 instructions:u # 1.92 insn per cycle + # 0.00 stalled cycles per insn (75.67%) + 0.456092705 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.116347e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323728e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323728e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.003939 sec - 17,734,646,388 cycles # 2.952 GHz - 43,888,539,389 instructions # 2.47 insn per cycle - 6.012704487 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.393713e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.641850e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.641850e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.245299 sec + 15,999,006,907 cycles:u # 3.034 GHz (74.97%) + 40,433,716 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.97%) + 28,298,756 stalled-cycles-backend:u # 0.18% backend cycles idle (74.98%) + 44,060,621,572 instructions:u # 2.75 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 5.275168966 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.363202e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.659809e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.659809e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.023784 sec - 9,025,879,023 cycles # 2.979 GHz - 21,581,883,686 instructions # 2.39 insn per cycle - 3.037037719 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.750362e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.958887e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.958887e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 3.024286 sec + 8,985,652,165 cycles:u # 2.945 GHz (74.88%) + 41,957,731 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.85%) + 136,530,468 stalled-cycles-backend:u # 1.52% backend cycles idle (74.94%) + 21,748,114,495 instructions:u # 2.42 insn per cycle + # 0.01 stalled cycles per insn (75.07%) + 3.055192320 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.517437e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.926566e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.926566e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.869514 sec - 8,114,381,669 cycles # 2.822 GHz - 15,430,189,803 instructions # 1.90 insn per cycle - 2.880961397 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.132254e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.620812e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.620812e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.740350 sec + 8,097,875,361 cycles:u # 2.925 GHz (74.88%) + 43,314,820 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.88%) + 1,759,136,170 stalled-cycles-backend:u # 21.72% backend cycles idle (74.94%) + 15,336,449,761 instructions:u # 1.89 insn per cycle + # 0.11 stalled cycles per insn (75.09%) + 2.772292539 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2524) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.623245e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.244709e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.244709e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.761284 sec - 7,902,083,853 cycles # 2.856 GHz - 15,086,749,902 instructions # 1.91 insn per cycle - 2.775513939 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.640062e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.253768e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.253768e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.763339 sec - 6,167,048,554 cycles # 2.227 GHz - 12,244,798,321 instructions # 1.99 insn per cycle - 2.776809715 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052431359538E-002 -Relative difference = 1.895346165094282e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 9e1d2d7d02..892dd28020 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,209 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_16:59:30 +DATE: 2024-02-08_18:49:54 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.294853e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.190192e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.272408e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.566411 sec - 2,320,420,364 cycles # 2.936 GHz - 3,656,536,300 instructions # 1.58 insn per cycle - 0.849896107 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.916931e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.213424e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.976795e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 +TOTAL : 0.380532 sec + 928,460,266 cycles:u # 2.336 GHz (74.61%) + 2,128,809 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.93%) + 5,749,211 stalled-cycles-backend:u # 0.62% backend cycles idle (75.91%) + 1,808,802,965 instructions:u # 1.95 insn per cycle + # 0.00 stalled cycles per insn (75.99%) + 0.426363917 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.453099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.830259e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.830259e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.693929 sec - 13,775,897,740 cycles # 2.932 GHz - 37,848,679,682 instructions # 2.75 insn per cycle - 4.700073558 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.897857e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.340213e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.340213e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.989075 sec + 13,209,705,815 cycles:u # 3.290 GHz (74.90%) + 39,659,904 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.91%) + 1,255,460,951 stalled-cycles-backend:u # 9.50% backend cycles idle (74.92%) + 38,009,003,114 instructions:u # 2.88 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 4.018523538 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039414671366E-002 -Relative difference = 4.562884388571957e-08 +Avg ME (F77/C++) = 1.2828039543819614E-002 +Relative difference = 3.5561191488957804e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.783255e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.752995e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.752995e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.617239 sec - 7,913,140,975 cycles # 3.018 GHz - 18,602,943,912 instructions # 2.35 insn per cycle - 2.623349513 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.447145e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.299888e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.299888e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.499550 sec + 8,065,061,748 cycles:u # 3.192 GHz (74.99%) + 41,243,845 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.99%) + 240,300,906 stalled-cycles-backend:u # 2.98% backend cycles idle (74.99%) + 18,644,458,983 instructions:u # 2.31 insn per cycle + # 0.01 stalled cycles per insn (75.01%) + 2.530724959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.888330e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.793097e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.793097e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.536213 sec - 7,410,239,026 cycles # 2.916 GHz - 14,339,138,310 instructions # 1.94 insn per cycle - 2.542223979 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053246266791E-002 -Relative difference = 2.5306003563303186e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.820494e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.969123e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.969123e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.323103 sec + 7,440,529,285 cycles:u # 3.167 GHz (74.86%) + 41,648,732 stalled-cycles-frontend:u # 0.56% frontend cycles idle (75.00%) + 1,090,351,915 stalled-cycles-backend:u # 14.65% backend cycles idle (74.98%) + 14,376,530,797 instructions:u # 1.93 insn per cycle + # 0.08 stalled cycles per insn (74.98%) + 2.353403381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2233) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.941966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.003945e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.003945e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.495128 sec - 7,300,359,510 cycles # 2.920 GHz - 13,954,504,737 instructions # 1.91 insn per cycle - 2.501321687 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053277189611E-002 -Relative difference = 2.5547059841227576e-07 +Avg ME (F77/C++) = 1.2828053337216261E-002 +Relative difference = 2.601499261602198e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.769433e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.465872e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.465872e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.633617 sec - 6,283,460,391 cycles # 2.382 GHz - 13,208,445,681 instructions # 2.10 insn per cycle - 2.639761638 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052540498902E-002 -Relative difference = 1.980424851420537e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index ea408a5346..fd4bcefacd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,209 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-02-02_16:59:58
+DATE: 2024-02-08_18:50:16
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.296905e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.203816e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.333243e+09 ) sec^-1
-MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.564095 sec
- 2,297,432,959 cycles # 2.909 GHz
- 3,534,498,878 instructions # 1.54 insn per cycle
- 0.848536295 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 95
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=1, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.914948e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.243896e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.010655e+08 ) sec^-1
+MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0
+TOTAL : 0.380778 sec
+ 968,057,402 cycles:u # 2.420 GHz (74.18%)
+ 2,210,095 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.00%)
+ 4,956,770 stalled-cycles-backend:u # 0.51% backend cycles idle (74.85%)
+ 1,792,765,521 instructions:u # 1.85 insn per cycle
+ # 0.00 stalled cycles per insn (76.46%)
+ 0.426408994 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828112125134794E-002
-Relative difference = 7.1815552823662555e-06
+Avg ME (F77/CUDA) = 1.2828036033170065E-002
+Relative difference = 1.2498553996774023e-06
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.077580e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.911900e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.911900e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.386141 sec
- 10,138,781,530 cycles # 2.991 GHz
- 28,401,151,740 instructions # 2.80 insn per cycle
- 3.392261093 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.648148e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.582942e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.582942e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
+TOTAL : 3.048682 sec
+ 9,955,006,358 cycles:u # 3.238 GHz (75.02%)
+ 39,164,420 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%)
+ 30,308,380 stalled-cycles-backend:u # 0.30% backend cycles idle (75.03%)
+ 28,534,668,727 instructions:u # 2.87 insn per cycle
+ # 0.00 stalled cycles per insn (75.03%)
+ 3.077460057 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039441956207E-002
-Relative difference = 4.35018750695023e-08
+Avg ME (F77/C++) = 1.2828039569285465E-002
+Relative difference = 3.357602059382168e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.009112e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.540183e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.540183e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.453458 sec
- 7,282,809,346 cycles # 2.963 GHz
- 16,786,519,808 instructions # 2.30 insn per cycle
- 2.459368234 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.799299e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.213042e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.213042e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
+TOTAL : 2.329377 sec
+ 7,459,821,031 cycles:u # 3.166 GHz (74.88%)
+ 37,456,115 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.90%)
+ 38,987,114 stalled-cycles-backend:u # 0.52% backend cycles idle (74.90%)
+ 16,973,419,713 instructions:u # 2.28 insn per cycle
+ # 0.00 stalled cycles per insn (74.97%)
+ 2.360322814 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039280066150E-002
-Relative difference = 5.612189004572479e-08
+Avg ME (F77/C++) = 1.2828039385567536E-002
+Relative difference = 4.7897610623017996e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.055808e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.285205e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.285205e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.420703 sec
- 7,100,946,535 cycles # 2.928 GHz
- 13,729,472,446 instructions # 1.93 insn per cycle
- 2.426727137 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053198973066E-002
-Relative difference = 2.4937329255889414e-07
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 4.006656e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.441300e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.441300e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
+TOTAL : 2.241524 sec
+ 7,193,817,128 cycles:u # 3.172 GHz (75.00%)
+ 42,485,934 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.96%)
+ 351,808,028 stalled-cycles-backend:u # 4.89% backend cycles idle (74.96%)
+ 13,627,130,798 instructions:u # 1.89 insn per cycle
+ # 0.03 stalled cycles per insn (74.96%)
+ 2.271606398 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2064) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.087504e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.397509e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.397509e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.394356 sec
- 7,028,875,611 cycles # 2.930 GHz
- 13,461,006,629 instructions # 1.92 insn per cycle
- 2.400705336 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053198973066E-002
-Relative difference = 2.4937329255889414e-07
+Avg ME (F77/C++) = 1.2828053331759293E-002
+Relative difference = 2.597245327285885e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.841439e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.709202e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.709202e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.581847 sec
- 6,061,187,130 cycles # 2.344 GHz
- 12,911,648,801 instructions # 2.13 insn per cycle
- 2.587907212 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828052431359538E-002
-Relative difference = 1.895346165094282e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
index f0b403a7a3..c82d6365dd 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
@@ -1,209 +1,168 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-02-02_16:32:02
+DATE: 2024-02-08_18:18:23
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.711659e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.330223e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.162765e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.696991 sec
- 2,634,872,482 cycles # 2.816 GHz
- 4,078,287,466 instructions # 1.55 insn per cycle
- 1.011316469 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.317919e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.117489e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.346996e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0
+TOTAL : 0.535140 sec
+ 1,268,315,502 cycles:u # 2.268 GHz (75.02%)
+ 2,382,364 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.52%)
+ 5,765,171 stalled-cycles-backend:u # 0.45% backend cycles idle (74.15%)
+ 2,159,570,463 instructions:u # 1.70 insn per cycle
+ # 0.00 stalled cycles per insn (74.51%)
+ 0.589109914 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039901590279E-002
-Relative difference = 7.671454200650844e-09
+Avg ME (F77/CUDA) = 1.2828039901590281E-002
+Relative difference = 7.67145406542181e-09
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.759625e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.131070e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.131070e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.866947 sec
- 19,685,481,181 cycles # 2.865 GHz
- 46,978,836,921 instructions # 2.39 insn per cycle
- 6.876022106 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.120301e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.279766e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.279766e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 6.407405 sec
+ 19,623,458,416 cycles:u # 3.048 GHz (74.97%)
+ 54,753,699 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.03%)
+ 154,428,151 stalled-cycles-backend:u # 0.79% backend cycles idle (75.03%)
+ 47,024,297,157 instructions:u # 2.40 insn per cycle
+ # 0.00 stalled cycles per insn (75.03%)
+ 6.441470641 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 473) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039952548879E-002
 Relative difference = 3.6990156841838714e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.592972e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.099500e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.099500e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.366697 sec
- 12,514,683,333 cycles # 2.862 GHz
- 30,923,878,603 instructions # 2.47 insn per cycle
- 4.382224528 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.805955e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.283656e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.283656e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.269630 sec
+ 12,905,192,325 cycles:u # 3.000 GHz (74.93%)
+ 51,265,561 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.90%)
+ 2,181,477,361 stalled-cycles-backend:u # 16.90% backend cycles idle (74.93%)
+ 30,970,710,391 instructions:u # 2.40 insn per cycle
+ # 0.07 stalled cycles per insn (75.03%)
+ 4.306090956 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039952548879E-002
 Relative difference = 3.6990156841838714e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.897421e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.636211e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.636211e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.735735 sec
- 10,227,702,915 cycles # 2.734 GHz
- 19,547,572,223 instructions # 1.91 insn per cycle
- 3.752605402 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.504584e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.300428e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.300428e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.256812 sec
+ 10,354,666,001 cycles:u # 3.150 GHz (74.94%)
+ 48,200,633 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.96%)
+ 901,434,633 stalled-cycles-backend:u # 8.71% backend cycles idle (74.95%)
+ 19,378,250,348 instructions:u # 1.87 insn per cycle
+ # 0.05 stalled cycles per insn (74.94%)
+ 3.291563108 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2101) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039951670679E-002
 Relative difference = 3.767475112924841e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.005313e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.852431e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.852431e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.559042 sec
- 9,712,164,921 cycles # 2.725 GHz
- 18,859,732,546 instructions # 1.94 insn per cycle
- 3.576286985 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039951670679E-002
-Relative difference = 3.767475112924841e-09
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.822292e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.480978e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.480978e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.871344 sec
- 8,100,287,129 cycles # 2.089 GHz
- 14,814,424,737 instructions # 1.83 insn per cycle
- 3.887875616 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039951670679E-002
-Relative difference = 3.767475112924841e-09
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
index 1fb02e7865..2549edf04b 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
@@ -1,209 +1,168 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-02_16:32:38 +DATE: 2024-02-08_18:18:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.757135e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.499496e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.135281e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.699288 sec - 2,642,729,633 cycles # 2.818 GHz - 4,042,518,417 instructions # 1.53 insn per cycle - 1.012731835 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.901466e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.593663e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.920117e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.523632 sec + 1,231,852,545 cycles:u # 2.262 GHz (73.87%) + 2,296,434 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.76%) + 5,025,010 stalled-cycles-backend:u # 0.41% backend cycles idle (75.17%) + 2,041,125,250 instructions:u # 1.66 insn per cycle + # 0.00 stalled cycles per insn (75.36%) + 0.574251304 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/CUDA) = 1.2828039901590284E-002 +Relative difference = 7.67145379496374e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.042700e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.222839e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.222839e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.453751 sec - 18,494,474,867 cycles # 2.863 GHz - 44,591,348,128 instructions # 2.41 insn per cycle - 6.462820772 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.159622e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.334824e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.334824e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 6.233625 sec + 18,579,167,948 cycles:u # 2.966 GHz (74.97%) + 50,731,815 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.97%) + 58,751,802 stalled-cycles-backend:u # 0.32% backend cycles idle (74.98%) + 44,764,886,897 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 6.266696472 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.640583e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.183791e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.183791e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.251463 sec
- 12,190,129,130 cycles # 2.863 GHz
- 30,217,078,040 instructions # 2.48 insn per cycle
- 4.268512673 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.799612e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.291488e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.291488e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.306201 sec
+ 12,648,742,085 cycles:u # 2.916 GHz (74.87%)
+ 50,507,268 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.84%)
+ 1,829,208,873 stalled-cycles-backend:u # 14.46% backend cycles idle (74.94%)
+ 30,286,759,013 instructions:u # 2.39 insn per cycle
+ # 0.06 stalled cycles per insn (75.03%)
+ 4.341760557 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039952548879E-002
Relative difference = 3.6990156841838714e-09
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.923050e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.684418e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.684418e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.686216 sec
- 10,215,074,750 cycles # 2.767 GHz
- 19,037,008,370 instructions # 1.86 insn per cycle
- 3.701764044 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.318823e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.068967e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.068967e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.519779 sec
+ 10,213,564,621 cycles:u # 2.876 GHz (75.00%)
+ 56,572,968 stalled-cycles-frontend:u # 0.55% frontend cycles idle (75.00%)
+ 289,161,532 stalled-cycles-backend:u # 2.83% backend cycles idle (75.00%)
+ 18,774,056,028 instructions:u # 1.84 insn per cycle
+ # 0.02 stalled cycles per insn (75.02%)
+ 3.555444739 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2054) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.282804e-02
Avg ME (F77/C++) = 1.2828039951670679E-002
Relative difference = 3.767475112924841e-09
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.121890e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.047393e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.047393e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.368495 sec
- 9,605,623,565 cycles # 2.847 GHz
- 18,452,217,442 instructions # 1.92 insn per cycle
- 3.384485361 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039951670679E-002
-Relative difference = 3.767475112924841e-09
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.363961e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.494536e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.494536e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.063704 sec
- 7,189,299,996 cycles # 2.342 GHz
- 13,242,449,549 instructions # 1.84 insn per cycle
- 3.076756183 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
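For reference, the tolerance test reported by the "cmpExe ... fcheck.exe" lines above compares the average matrix element computed by the C++ executable against the Fortran wrapper, accepting relative differences up to 5E-3. Below is a minimal standalone sketch of that comparison; it is not code from this PR, and the two Avg ME inputs are simply the values printed in this log.

#include <cassert>
#include <cmath>
#include <cstdio>

int main()
{
  const double meCpp = 1.282804e-02;           // "Avg ME (C++/C++)" from the log above
  const double meF77 = 1.2828039951670679e-02; // "Avg ME (F77/C++)" from the log above
  const double relDiff = std::fabs( meF77 / meCpp - 1 );
  printf( "Relative difference = %g\n", relDiff ); // ~3.77e-09 for these inputs
  assert( relDiff <= 5e-3 ); // the "OK (relative difference <= 5E-3)" criterion
  return 0;
}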
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039951670679E-002
-Relative difference = 3.767475112924841e-09
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
index 672f38f61c..e419a8a864 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
@@ -1,209 +1,168 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-02-02_16:33:12
+DATE: 2024-02-08_18:19:22
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.185725e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.141503e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.271658e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.532887 sec
- 2,257,199,293 cycles # 2.943 GHz
- 3,199,039,986 instructions # 1.42 insn per cycle
- 0.842617574 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.337007e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.962793e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.020690e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0
+TOTAL : 0.463140 sec
+ 876,830,409 cycles:u # 2.041 GHz (74.46%)
+ 2,297,043 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.39%)
+ 4,959,442 stalled-cycles-backend:u # 0.57% backend cycles idle (75.77%)
+ 1,390,914,236 instructions:u # 1.59 insn per cycle
+ # 0.00 stalled cycles per insn (76.13%)
+ 0.518184364 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516822
-Relative difference = 3.2588034143755247e-07
+Avg ME (F77/CUDA) = 2.0288063388516817
+Relative difference = 3.258803416564443e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.054415e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.115512e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.115512e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.215008 sec
- 14,961,228,906 cycles # 2.866 GHz
- 38,722,992,457 instructions # 2.59 insn per cycle
- 5.224008183 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.224994e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.282415e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.282415e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 4.909539 sec
+ 14,979,399,811 cycles:u # 3.032 GHz (74.92%)
+ 9,752,610 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.90%)
+ 738,781,241 stalled-cycles-backend:u # 4.93% backend cycles idle (74.95%)
+ 38,733,604,171 instructions:u # 2.59 insn per cycle
+ # 0.02 stalled cycles per insn (75.03%)
+ 4.943604173 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515649
Relative difference = 3.258803992249869e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.481444e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.675605e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.675605e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.125889 sec
- 8,951,898,208 cycles # 2.861 GHz
- 24,430,367,428 instructions # 2.73 insn per cycle
- 3.138681533 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 3.991944e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.192633e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.192633e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 2.822701 sec
+ 8,530,609,378 cycles:u # 2.989 GHz (74.82%)
+ 8,934,860 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.91%)
+ 200,257,724 stalled-cycles-backend:u # 2.35% backend cycles idle (75.05%)
+ 24,381,384,726 instructions:u # 2.86 insn per cycle
+ # 0.01 stalled cycles per insn (75.05%)
+ 2.857525850 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515654
Relative difference = 3.2588039900609506e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.403552e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.873344e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.873344e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.051544 sec
- 5,532,701,160 cycles # 2.689 GHz
- 11,562,226,101 instructions # 2.09 insn per cycle
- 2.068989985 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 6.771764e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.289617e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.289617e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.745992 sec
+ 5,163,847,173 cycles:u # 2.904 GHz (74.69%)
+ 7,925,713 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.62%)
+ 1,076,944,521 stalled-cycles-backend:u # 20.86% backend cycles idle (74.82%)
+ 11,575,904,777 instructions:u # 2.24 insn per cycle
+ # 0.09 stalled cycles per insn (75.04%)
+ 1.782036752 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388516204
Relative difference = 3.2588037186351226e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.265641e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.903037e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.903037e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.784779 sec
- 4,815,041,067 cycles # 2.689 GHz
- 10,339,970,427 instructions # 2.15 insn per cycle
- 1.798345856 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.954123e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.196816e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.196816e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.763217 sec
- 4,948,449,645 cycles # 1.787 GHz
- 7,556,267,450 instructions # 1.53 insn per cycle
- 2.777246704 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
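The EvtsPerSec lines in these logs are throughput figures: an event count divided by a timed section. Below is a minimal sketch of how such a figure can be derived, assuming the event count follows the "-p 2048 256 2" arguments of check.exe (2048 blocks x 256 threads x 2 iterations) and using an illustrative std::chrono timer rather than the plugin's own timer classes; it is not code from this PR.

#include <chrono>
#include <cstdio>

int main()
{
  const int gpublocks = 2048, gputhreads = 256, niter = 2; // the "-p 2048 256 2" arguments
  const double nevtTotal = double( gpublocks ) * gputhreads * niter; // 1048576 events in total
  const auto t0 = std::chrono::steady_clock::now();
  // ... compute the matrix elements for all events here ...
  const auto t1 = std::chrono::steady_clock::now();
  const double secs = std::chrono::duration<double>( t1 - t0 ).count();
  printf( "EvtsPerSec[MatrixElems] = ( %e ) sec^-1\n", nevtTotal / secs );
  return 0;
}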
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
index 31a2de1d4c..e68c61298a 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
@@ -1,222 +1,175 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-02-02_17:10:51
+DATE: 2024-02-08_19:09:09
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.485204e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.887796e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.887796e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.812556 sec
- 3,100,289,953 cycles # 2.933 GHz
- 4,827,993,602 instructions # 1.56 insn per cycle
- 1.114474436 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.947613e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.787187e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.787187e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.237626 sec
+ 3,741,397,276 cycles:u # 2.935 GHz (74.96%)
+ 21,950,070 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.16%)
+ 1,158,796,162 stalled-cycles-backend:u # 30.97% backend cycles idle (75.16%)
+ 3,864,054,831 instructions:u # 1.03 insn per cycle
+ # 0.30 stalled cycles per insn (74.88%)
+ 1.302713629 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516822
-Relative difference = 3.2588034143755247e-07
+Avg ME (F77/CUDA) = 2.0288063388516817
+Relative difference = 3.258803416564443e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.138016e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.200803e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.200803e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.088592 sec
- 15,313,839,146 cycles # 3.006 GHz
- 38,782,932,119 instructions # 2.53 insn per cycle
- 5.096133332 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.520852e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.586471e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.586471e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 4.421469 sec
+ 15,062,320,690 cycles:u # 3.374 GHz (74.92%)
+ 10,443,532 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%)
+ 743,646,927 stalled-cycles-backend:u # 4.94% backend cycles idle (74.94%)
+ 38,749,016,389 instructions:u # 2.57 insn per cycle
+ # 0.02 stalled cycles per insn (75.03%)
+ 4.467786878 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515649
Relative difference = 3.258803992249869e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.651731e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.851010e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.851010e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.056918 sec
- 9,290,519,364 cycles # 3.033 GHz
- 24,611,762,773 instructions # 2.65 insn per cycle
- 3.064704949 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.490354e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.717831e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.717831e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 2.604527 sec
+ 8,669,634,857 cycles:u # 3.274 GHz (74.93%)
+ 9,990,683 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.93%)
+ 213,001,071 stalled-cycles-backend:u # 2.46% backend cycles idle (74.94%)
+ 24,550,642,471 instructions:u # 2.83 insn per cycle
+ # 0.01 stalled cycles per insn (75.09%)
+ 2.652317282 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515654
Relative difference = 3.2588039900609506e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.627308e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.117991e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.117991e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.050587 sec
- 5,909,859,968 cycles # 2.873 GHz
- 11,848,908,896 instructions # 2.00 insn per cycle
- 2.058431974 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 7.596512e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.175899e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.175899e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.652537 sec
+ 5,319,982,348 cycles:u # 3.135 GHz (75.05%)
+ 9,374,611 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.03%)
+ 1,081,454,167 stalled-cycles-backend:u # 20.33% backend cycles idle (75.02%)
+ 11,815,370,610 instructions:u # 2.22 insn per cycle
+ # 0.09 stalled cycles per insn (75.02%)
+ 1.701362888 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.543804e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.195187e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.195187e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.788435 sec
- 5,167,732,895 cycles # 2.879 GHz
- 10,625,416,094 instructions # 2.06 insn per cycle
- 1.795961014 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388516204
Relative difference = 3.2588037186351226e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.113967e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.367930e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.367930e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.739810 sec
- 5,308,369,796 cycles # 1.933 GHz
- 7,799,268,107 instructions # 1.47 insn per cycle
- 2.747512945 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
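The "is not supported (no avx512vl in /proc/cpuinfo)" lines above show the 512y and 512z builds being skipped on this AMD host, whose EPYC 7A53 CPU does not advertise avx512vl. Below is a minimal sketch of such a guard; it is illustrative only (the actual check is performed by the test driver scripts, and cpuHasFlag is a hypothetical helper, not an identifier from this repository).

#include <fstream>
#include <iostream>
#include <string>

// Return true if /proc/cpuinfo advertises the given CPU flag (Linux only)
bool cpuHasFlag( const std::string& flag )
{
  std::ifstream cpuinfo( "/proc/cpuinfo" );
  std::string line;
  while( std::getline( cpuinfo, line ) )
    if( line.rfind( "flags", 0 ) == 0 && line.find( " " + flag ) != std::string::npos )
      return true;
  return false;
}

int main()
{
  if( !cpuHasFlag( "avx512vl" ) )
  {
    std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
    return 0; // skip the 512y/512z builds on this host
  }
  // ... otherwise run the AVX512 build here ...
  return 0;
}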
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
index a758c3bfbe..3f33133cca 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
@@ -1,209 +1,165 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-02-02_17:24:19
+DATE: 2024-02-08_19:20:37
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.563084e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.152296e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.272225e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.537277e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.962263e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.016665e+07 ) sec^-1
MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 0.619204 sec
- 2,481,245,326 cycles # 2.921 GHz
- 3,595,032,588 instructions # 1.45 insn per cycle
- 0.907754121 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 1.071372 sec
+ 3,196,294,827 cycles:u # 2.913 GHz (75.22%)
+ 10,622,560 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.92%)
+ 1,138,534,793 stalled-cycles-backend:u # 35.62% backend cycles idle (74.92%)
+ 2,985,973,404 instructions:u # 0.93 insn per cycle
+ # 0.38 stalled cycles per insn (75.18%)
+ 1.123855307 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516822
-Relative difference = 3.2588034143755247e-07
+Avg ME (F77/CUDA) = 2.0288063388516817
+Relative difference = 3.258803416564443e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.136701e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.200418e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.200418e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.526171e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.593711e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.593711e+05 ) sec^-1
MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 5.076253 sec
- 15,160,537,185 cycles # 2.984 GHz
- 38,740,080,300 instructions # 2.56 insn per cycle
- 5.082603196 seconds time elapsed
+TOTAL : 4.333890 sec
+ 14,966,789,355 cycles:u # 3.429 GHz (74.91%)
+ 9,417,395 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%)
+ 819,315,354 stalled-cycles-backend:u # 5.47% backend cycles idle (74.96%)
+ 38,694,093,641 instructions:u # 2.59 insn per cycle
+ # 0.02 stalled cycles per insn (75.06%)
+ 4.368016125 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515649
Relative difference = 3.258803992249869e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.677020e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.881790e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.881790e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.520390e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.747995e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.747995e+05 ) sec^-1
MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 3.016913 sec
- 9,133,169,341 cycles # 3.022 GHz
- 24,427,912,232 instructions # 2.67 insn per cycle
- 3.023499002 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.498026 sec
+ 8,520,170,360 cycles:u # 3.369 GHz (75.02%)
+ 9,607,172 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.02%)
+ 200,455,121 stalled-cycles-backend:u # 2.35% backend cycles idle (75.01%)
+ 24,437,311,730 instructions:u # 2.87 insn per cycle
+ # 0.01 stalled cycles per insn (74.88%)
+ 2.531681558 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515654
Relative difference = 3.2588039900609506e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.714567e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.208792e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.208792e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.690089e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.288935e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.288935e+05 ) sec^-1
MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 2.002108 sec
- 5,714,978,439 cycles # 2.847 GHz
- 11,544,025,075 instructions # 2.02 insn per cycle
- 2.008418160 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
+TOTAL : 1.547010 sec
+ 5,201,654,467 cycles:u # 3.295 GHz (74.68%)
+ 9,324,520 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.79%)
+ 1,064,918,671 stalled-cycles-backend:u # 20.47% backend cycles idle (75.04%)
+ 11,505,267,087 instructions:u # 2.21 insn per cycle
+ # 0.09 stalled cycles per insn (75.17%)
+ 1.580873032 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388516204
Relative difference = 3.2588037186351226e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
-WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.601255e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.283556e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.283556e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.757226 sec - 5,021,954,612 cycles # 2.849 GHz - 10,288,054,214 instructions # 2.05 insn per cycle - 1.763583538 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.326508e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.602132e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.602132e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.593451 sec - 5,132,574,711 cycles # 1.976 GHz - 7,502,792,533 instructions # 1.46 insn per cycle - 2.599823469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
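The replaced lines above show the new behaviour on the AMD EPYC host: instead of running, the 512y and 512z binaries now exit early with "is not supported (no avx512vl in /proc/cpuinfo)". The sketch below illustrates the kind of guard that message implies; it is not the plugin's actual code, and the helper name cpuHasFlag is invented here.

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

// Scan /proc/cpuinfo for a CPU feature flag (hypothetical helper)
bool cpuHasFlag( const std::string& flag )
{
  std::ifstream cpuinfo( "/proc/cpuinfo" );
  std::string line;
  while( std::getline( cpuinfo, line ) )
  {
    if( line.rfind( "flags", 0 ) != 0 ) continue; // only parse "flags : ..." lines
    std::istringstream tokens( line );
    std::string token;
    while( tokens >> token )
      if( token == flag ) return true;
  }
  return false;
}

int main( int argc, char** argv )
{
  if( !cpuHasFlag( "avx512vl" ) )
  {
    // Same wording as the log lines above
    std::cout << argv[0] << " is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
    return 0;
  }
  // ... run the AVX512-enabled workload ...
  return 0;
}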
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 09fa2088b2..b2a67c7d91 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -6,8 +6,9 @@ AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' @@ -41,7 +42,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_17:20:55 +DATE: 2024-02-05_22:05:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +51,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.568020e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155224e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270184e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.574516e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156692e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271930e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.557941 sec - 2,317,819,031 cycles # 2.939 GHz - 3,571,672,996 instructions # 1.54 insn per cycle - 0.846156784 seconds time elapsed +TOTAL : 0.550400 sec + 2,362,670,466 cycles # 3.032 GHz + 3,700,412,120 instructions # 1.57 insn per cycle + 0.836987659 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +78,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.154286e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.219615e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.219615e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.199981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.265430e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.265430e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.975710 sec - 14,982,122,126 cycles # 3.009 GHz - 38,724,226,197 instructions # 2.58 insn per cycle - 4.982309696 seconds time elapsed +TOTAL : 4.873281 sec + 14,976,294,919 cycles # 3.071 GHz + 38,723,494,875 instructions # 2.59 insn per cycle + 4.880112383 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +105,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.680555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.886973e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.886973e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.720404e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.926714e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.926714e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.953814 sec - 8,955,704,462 cycles # 3.026 GHz - 24,429,663,092 instructions # 2.73 insn per cycle - 2.960547809 seconds time elapsed +TOTAL : 2.922753 sec + 8,955,117,292 cycles # 3.059 GHz + 24,429,239,776 instructions # 2.73 insn per cycle + 2.929117772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.741265e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.240782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.240782e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.848139e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.358480e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.932283 sec - 5,529,837,786 cycles # 2.854 GHz - 11,561,260,493 instructions # 2.09 insn per cycle - 1.938649083 seconds time elapsed +TOTAL : 1.898353 sec + 5,532,392,582 cycles # 2.906 GHz + 11,561,358,692 instructions # 2.09 insn per cycle + 1.904642376 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) 
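Throughout these logs, perf prints derived ratios next to the raw counters: "insn per cycle" is instructions divided by cycles, and the GHz figure is cycles divided by the measured task clock (on the new AMD host the counters additionally carry a ":u" suffix, meaning they count user space only). The snippet below merely reproduces the two ratios from the avx2 run just above; it is arithmetic on numbers already present in the log.

#include <cstdio>

int main()
{
  // Raw counters copied from the avx2 curhst run above
  const double cycles = 5532392582.;
  const double instructions = 11561358692.;
  const double elapsed = 1.904642376; // wall clock; perf's GHz uses the task clock
  std::printf( "insn per cycle ~ %.2f\n", instructions / cycles );  // ~2.09, as printed
  std::printf( "effective GHz  ~ %.2f\n", cycles / elapsed / 1e9 ); // ~2.90, close to the 2.906 GHz in the log
  return 0;
}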
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +159,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.627139e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.311306e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.311306e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.810691e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.505130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.505130e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.688646 sec - 4,821,410,673 cycles # 2.846 GHz - 10,338,456,140 instructions # 2.14 insn per cycle - 1.695233735 seconds time elapsed +TOTAL : 1.645115 sec + 4,822,133,768 cycles # 2.922 GHz + 10,338,425,342 instructions # 2.14 insn per cycle + 1.651426752 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +186,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.342985e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.624658e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.624658e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.293803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.567203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.567203e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.521580 sec - 4,951,458,800 cycles # 1.960 GHz - 7,553,494,257 instructions # 1.53 insn per cycle - 2.527890437 seconds time elapsed +TOTAL : 2.549026 sec + 4,939,812,418 cycles # 1.934 GHz + 7,554,058,853 instructions # 1.53 insn per cycle + 2.555210030 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 2a78bc6e18..8621d1f638 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,211 +1,169 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: 
Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
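These log headers also record the build-flag change made in this patch: the single RNDGEN flag is split into HASCURAND and HASHIPRAND, and both states are now appended to the build tag (here avx2_d_inl0_hrd0_hasNoCurand_hasHiprand). The sketch below shows how such a tag could be assembled; the real composition is done in the cudacpp makefiles, and buildTag is a hypothetical helper written only to make the naming scheme explicit.

#include <iostream>
#include <string>

// Hypothetical helper mirroring the tag format seen in these logs
std::string buildTag( const std::string& avx, const std::string& fptype,
                      int helinl, int hrdcod, bool hasCurand, bool hasHiprand )
{
  return avx + "_" + fptype + "_inl" + std::to_string( helinl )
         + "_hrd" + std::to_string( hrdcod )
         + ( hasCurand ? "_hasCurand" : "_hasNoCurand" )
         + ( hasHiprand ? "_hasHiprand" : "_hasNoHiprand" );
}

int main()
{
  // Prints avx2_d_inl0_hrd0_hasNoCurand_hasHiprand, as in the log above
  std::cout << buildTag( "avx2", "d", 0, 0, false, true ) << std::endl;
  return 0;
}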
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_17:17:35 +DATE: 2024-02-08_19:16:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.853265e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153643e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268146e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.707789 sec - 2,784,870,052 cycles # 2.929 GHz - 4,318,818,497 instructions # 1.55 insn per cycle - 1.010225269 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.809222e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.964400e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018784e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.180941 sec + 3,644,641,887 cycles:u # 2.998 GHz (75.01%) + 21,509,076 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.99%) + 1,148,002,124 stalled-cycles-backend:u # 31.50% backend cycles idle (74.98%) + 3,857,731,060 instructions:u # 1.06 insn per cycle + # 0.30 stalled cycles per insn (74.96%) + 1.237021055 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.156007e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.220053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.220053e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.970938 sec - 14,995,099,526 cycles # 3.014 GHz - 38,722,072,628 instructions # 2.58 insn per cycle - 4.977096590 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.502070e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.566557e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.566557e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.371978 sec + 15,091,805,960 cycles:u # 3.426 GHz (74.95%) + 10,010,529 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.94%) + 814,210,100 stalled-cycles-backend:u # 5.40% backend cycles idle (74.94%) + 38,744,802,078 instructions:u # 2.57 insn per cycle + # 0.02 stalled cycles per insn (74.98%) + 4.407218033 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.677370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.884329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.884329e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.957823 sec - 8,949,231,815 cycles # 3.020 GHz - 24,428,872,352 instructions # 2.73 insn per cycle - 2.965019767 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.523918e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.755564e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.755564e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.499500 sec + 8,521,864,904 cycles:u # 3.366 GHz (74.95%) + 10,288,038 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.04%) + 199,650,176 stalled-cycles-backend:u # 2.34% backend cycles idle (75.04%) + 24,356,495,689 instructions:u # 2.86 insn per cycle + # 0.01 stalled cycles per insn (75.05%) + 2.534569681 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.602192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.079134e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.079134e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.979040 sec - 5,538,527,993 cycles # 2.792 GHz - 11,561,582,235 instructions # 2.09 insn per cycle - 1.985442657 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.693357e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.287111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.287111e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.545162 sec + 5,188,095,469 cycles:u # 3.288 GHz (74.70%) + 9,172,935 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.94%) + 1,065,100,619 stalled-cycles-backend:u # 20.53% backend cycles idle (75.16%) + 11,463,135,587 instructions:u # 2.21 insn per cycle + # 0.09 stalled cycles per insn (75.17%) + 1.580365960 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.633972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.327666e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.327666e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.688145 sec - 4,813,595,906 cycles # 2.842 GHz - 10,338,321,927 instructions # 2.15 insn per cycle - 1.694491184 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.326727e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.606654e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.606654e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.531021 sec - 4,952,716,249 cycles # 1.953 GHz - 7,554,626,167 instructions # 1.53 insn per cycle - 2.537459783 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index a61b4fccb4..c811de5803 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,209 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
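The WARNING that recurs in the new logs, "RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)", reflects a three-way choice of random number source on HIP builds. A hypothetical sketch of that decision follows; the names are invented for illustration, and the actual selection lives in the plugin's driver code.

#include <iostream>

enum class RandomChoice { HiprandDevice, HiprandHost, CommonRandom };

RandomChoice chooseRandomGenerator( bool rmbOnDevice, bool haveHiprand )
{
  if( !haveHiprand ) return RandomChoice::CommonRandom; // no hiprand in this build
  if( rmbOnDevice ) return RandomChoice::HiprandDevice; // rambo on GPU: numbers can stay on device
  // RamboHost selected: HiprandDevice cannot feed a host-side rambo, and a
  // host-side hiprand path is not implemented yet, hence the fallback above
  return RandomChoice::CommonRandom;
}

int main()
{
  const bool fellBack = ( chooseRandomGenerator( false, true ) == RandomChoice::CommonRandom );
  std::cout << ( fellBack ? "CommonRandom (fallback)" : "Hiprand" ) << std::endl;
  return 0;
}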
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_16:33:41 +DATE: 2024-02-08_18:19:47 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.083953e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.139361e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277133e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.541239 sec - 2,177,546,361 cycles # 2.795 GHz - 3,128,043,818 instructions # 1.44 insn per cycle - 0.856591915 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.864130e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.922591e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.975815e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.403916 sec + 868,936,187 cycles:u # 2.036 GHz (73.90%) + 2,274,927 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.40%) + 5,131,368 stalled-cycles-backend:u # 0.59% backend cycles idle (75.66%) + 1,388,595,911 instructions:u # 1.60 insn per cycle + # 0.00 stalled cycles per insn (75.82%) + 0.457989081 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.193896e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.260442e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260442e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.886037 sec - 14,688,520,316 cycles # 3.003 GHz - 39,543,826,918 instructions # 2.69 insn per cycle - 4.896017871 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.161836e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.216439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.216439e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 5.046159 sec + 15,376,865,932 cycles:u # 3.029 GHz (74.95%) + 9,473,842 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.96%) + 20,032,982 stalled-cycles-backend:u # 0.13% backend cycles idle (74.96%) + 39,614,495,316 instructions:u # 2.58 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 5.081436215 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.658350e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.874113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.874113e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.975666 sec - 8,599,942,205 cycles # 2.884 GHz - 23,576,394,540 instructions # 2.74 insn per cycle - 2.990711914 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.948082e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.150404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.150404e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.858126 sec + 8,747,707,706 cycles:u # 3.025 GHz (74.86%) + 10,198,563 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.83%) + 1,205,896,793 stalled-cycles-backend:u # 13.79% backend cycles idle (74.94%) + 23,497,904,722 instructions:u # 2.69 insn per cycle + # 0.05 stalled cycles per insn (75.08%) + 2.938845264 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.095675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.498388e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.498388e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.167147 sec - 5,972,426,599 cycles # 2.749 GHz - 13,192,805,811 instructions # 2.21 insn per cycle - 2.182807028 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.218452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.688137e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.688137e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.882617 sec + 5,712,364,152 cycles:u # 2.984 GHz (74.98%) + 8,361,013 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.93%) + 1,034,558,412 stalled-cycles-backend:u # 18.11% backend cycles idle (74.93%) + 13,212,879,207 instructions:u # 2.31 insn per cycle + # 0.08 stalled cycles per insn (74.96%) + 1.918038279 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.567908e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.057618e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.057618e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.993504 sec - 5,545,340,461 cycles # 2.774 GHz - 12,101,858,128 instructions # 2.18 insn per cycle - 2.007287045 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.892190e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.117816e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.117816e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.801774 sec - 5,370,259,466 cycles # 1.913 GHz - 9,381,238,160 instructions # 1.75 insn per cycle - 2.815070972 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
index f86d85f93e..6e93c2f99f 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
@@ -1,209 +1,168 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-02-02_17:00:24
+DATE: 2024-02-08_18:50:37
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.552871e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.155882e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.271852e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.525009 sec
- 2,252,936,967 cycles # 2.935 GHz
- 3,226,291,426 instructions # 1.43 insn per cycle
- 0.826995753 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.887107e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.964739e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.018898e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0
+TOTAL : 0.370653 sec
+ 906,243,961 cycles:u # 2.307 GHz (74.68%)
+ 2,241,400 stalled-cycles-frontend:u # 0.25% frontend cycles idle (76.15%)
+ 5,816,559 stalled-cycles-backend:u # 0.64% backend cycles idle (75.71%)
+ 1,412,893,858 instructions:u # 1.56 insn per cycle
+ # 0.00 stalled cycles per insn (75.77%)
+ 0.419063355 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516822
-Relative difference = 3.2588034143755247e-07
+Avg ME (F77/CUDA) = 2.0288063388516817
+Relative difference = 3.258803416564443e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.345193e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.420245e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.420245e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 4.577274 sec
- 13,902,263,654 cycles # 3.034 GHz
- 35,849,110,668 instructions # 2.58 insn per cycle
- 4.583674578 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.820858e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.901952e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.901952e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 3.893828 sec
+ 13,216,640,916 cycles:u # 3.368 GHz (74.93%)
+ 8,991,237 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.94%)
+ 550,737,421 stalled-cycles-backend:u # 4.17% backend cycles idle (74.93%)
+ 35,871,958,786 instructions:u # 2.71 insn per cycle
+ # 0.02 stalled cycles per insn (74.95%)
+ 3.926487908 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515649
Relative difference = 3.258803992249869e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.045107e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.293604e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.293604e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.697459 sec
- 8,204,528,246 cycles # 3.035 GHz
- 21,906,743,123 instructions # 2.67 insn per cycle
- 2.704223130 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.376057e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.595663e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.595663e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 2.573148 sec
+ 8,750,160,649 cycles:u # 3.361 GHz (74.86%)
+ 10,672,247 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.81%)
+ 2,345,651,364 stalled-cycles-backend:u # 26.81% backend cycles idle (74.95%)
+ 21,833,904,041 instructions:u # 2.50 insn per cycle
+ # 0.11 stalled cycles per insn (75.10%)
+ 2.606979230 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515654
Relative difference = 3.2588039900609506e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.540581e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.020264e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.020264e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.001671 sec
- 5,533,891,457 cycles # 2.758 GHz
- 12,075,756,787 instructions # 2.18 insn per cycle
- 2.008182914 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 6.596704e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.037393e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.037393e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.766710 sec
+ 5,911,286,240 cycles:u # 3.289 GHz (74.98%)
+ 8,780,066 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.08%)
+ 2,229,927,098 stalled-cycles-backend:u # 37.72% backend cycles idle (75.08%)
+ 12,032,978,967 instructions:u # 2.04 insn per cycle
+ # 0.19 stalled cycles per insn (75.08%)
+ 1.800991140 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3046) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388516204
Relative difference = 3.2588037186351226e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.262748e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.863548e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.863548e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.781168 sec
- 5,117,197,454 cycles # 2.864 GHz
- 11,141,274,517 instructions # 2.18 insn per cycle
- 1.787609937 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.509349e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.809746e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.809746e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.432452 sec
- 4,812,064,531 cycles # 1.974 GHz
- 8,842,014,308 instructions # 1.84 insn per cycle
- 2.438854376 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
index a0c76606d7..e2182ea92b 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
@@ -1,209 +1,168 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-02-02_17:00:51
+DATE: 2024-02-08_18:50:59
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.558196e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.156345e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.274350e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.523193 sec
- 2,241,369,284 cycles # 2.943 GHz
- 3,174,985,760 instructions # 1.42 insn per cycle
- 0.818576914 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.857507e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.934612e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.988214e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0
+TOTAL : 0.368713 sec
+ 871,490,127 cycles:u # 2.227 GHz (73.86%)
+ 2,160,498 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.35%)
+ 5,083,056 stalled-cycles-backend:u # 0.58% backend cycles idle (75.46%)
+ 1,381,538,673 instructions:u # 1.59 insn per cycle
+ # 0.00 stalled cycles per insn (75.52%)
+ 0.419240239 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516822
-Relative difference = 3.2588034143755247e-07
+Avg ME (F77/CUDA) = 2.0288063388516817
+Relative difference = 3.258803416564443e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.600535e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.694087e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.694087e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 4.138630 sec
- 12,505,754,917 cycles # 3.019 GHz
- 35,731,722,240 instructions # 2.86 insn per cycle
- 4.145126972 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.190272e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.296909e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.296909e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 3.463551 sec
+ 11,796,590,494 cycles:u # 3.376 GHz (74.88%)
+ 9,800,385 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.99%)
+ 23,034,021 stalled-cycles-backend:u # 0.20% backend cycles idle (75.04%)
+ 35,636,026,969 instructions:u # 3.02 insn per cycle
+ # 0.00 stalled cycles per insn (75.05%)
+ 3.496623758 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515649
Relative difference = 3.258803992249869e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.072307e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.329834e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.329834e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.681168 sec
- 8,026,405,639 cycles # 2.988 GHz
- 21,260,106,738 instructions # 2.65 insn per cycle
- 2.687689205 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.763409e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.020763e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.020763e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 2.377364 sec
+ 8,062,357,173 cycles:u # 3.349 GHz (74.79%)
+ 10,706,692 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.90%)
+ 1,739,443,425 stalled-cycles-backend:u # 21.57% backend cycles idle (75.06%)
+ 21,151,073,294 instructions:u # 2.62 insn per cycle
+ # 0.08 stalled cycles per insn (75.08%)
+ 2.411386800 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388515654
Relative difference = 3.2588039900609506e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.852846e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.378269e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.378269e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.898868 sec
- 5,310,101,299 cycles # 2.794 GHz
- 11,407,590,843 instructions # 2.15 insn per cycle
- 1.905391490 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.798115e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.412179e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.412179e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.521896 sec
+ 5,031,958,555 cycles:u # 3.241 GHz (74.85%)
+ 9,260,998 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.79%)
+ 293,419,297 stalled-cycles-backend:u # 5.83% backend cycles idle (74.75%)
+ 11,412,283,229 instructions:u # 2.27 insn per cycle
+ # 0.03 stalled cycles per insn (74.92%)
+ 1.556234307 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2354) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028807e+00
Avg ME (F77/C++) = 2.0288063388516204
Relative difference = 3.2588037186351226e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.398117e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.040573e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.040573e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.745493 sec
- 4,984,670,896 cycles # 2.847 GHz
- 10,599,547,037 instructions # 2.13 insn per cycle
- 1.752010421 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.572456e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.879928e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.879928e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.399603 sec
- 4,714,165,858 cycles # 1.961 GHz
- 8,567,438,037 instructions # 1.82 insn per cycle
- 2.405978635 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
index 43d4ffde51..6e4e5c02af 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
@@ -1,209 +1,168 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-02-02_16:34:09
+DATE: 2024-02-08_18:20:11
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.533546e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.581615e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.967835e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
-TOTAL : 0.483309 sec
- 2,041,948,050 cycles # 2.874 GHz
- 2,912,467,412 instructions # 1.43 insn per cycle
- 0.787847132 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 9.717084e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.952544e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.117655e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0
+TOTAL : 0.353871 sec
+ 770,971,152 cycles:u # 2.054 GHz (73.64%)
+ 2,234,320 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.19%)
+ 4,680,600 stalled-cycles-backend:u # 0.61% backend cycles idle (75.49%)
+ 1,369,089,853 instructions:u # 1.78 insn per cycle
+ # 0.00 stalled cycles per insn (75.15%)
+ 0.403399827 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028811e+00
-Avg ME (F77/CUDA) = 2.0288499749731272
-Relative difference = 1.9210746159747678e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028815e+00
+Avg ME (F77/CUDA) = 2.0288173652952537
+Relative difference = 1.1658506339321586e-06
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.300273e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.376061e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.376061e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
-TOTAL : 4.645720 sec
- 13,896,395,234 cycles # 2.988 GHz
- 37,078,809,595 instructions # 2.67 insn per cycle
- 4.654393847 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.694112e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.777074e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.777074e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0
+TOTAL : 4.050256 sec
+ 12,639,314,034 cycles:u # 3.100 GHz (74.91%)
+ 7,170,658 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%)
+ 6,335,718 stalled-cycles-backend:u # 0.05% backend cycles idle (75.01%)
+ 37,070,689,405 instructions:u # 2.93 insn per cycle
+ # 0.00 stalled cycles per insn (75.09%)
+ 4.081116452 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288197983754799
-Relative difference = 9.938019153537065e-08
+Avg ME (F77/C++) = 2.0288198367925361
+Relative difference = 8.044452636897417e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.331058e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.794073e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.794073e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
-TOTAL : 2.053561 sec
- 6,160,962,018 cycles # 2.993 GHz
- 15,211,875,736 instructions # 2.47 insn per cycle
- 2.070718349 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 5.510638e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.885491e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.885491e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0
+TOTAL : 2.060524 sec
+ 6,352,569,577 cycles:u # 3.042 GHz (75.10%)
+ 7,069,743 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.10%)
+ 2,199,024,394 stalled-cycles-backend:u # 34.62% backend cycles idle (75.11%)
+ 15,190,984,241 instructions:u # 2.39 insn per cycle
+ # 0.14 stalled cycles per insn (75.11%)
+ 2.092123124 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288191968575120
-Relative difference = 9.703059369476286e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028820e+00
+Avg ME (F77/C++) = 2.0288198773050681
+Relative difference = 6.047600673895608e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.320822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.072262e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.072262e+06 ) sec^-1
-MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
-TOTAL : 1.211627 sec
- 3,445,855,702 cycles # 2.832 GHz
- 7,715,341,435 instructions # 2.24 insn per cycle
- 1.224231764 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288179996423423
-Relative difference = 1.7628858734720142e-10
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 1.110080e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.257453e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.257453e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0
+TOTAL : 1.099872 sec
+ 3,366,039,321 cycles:u # 2.985 GHz (74.70%)
+ 7,543,952 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.05%)
+ 922,135,812 stalled-cycles-backend:u # 27.40% backend cycles idle (75.18%)
+ 7,686,666,680 instructions:u # 2.28 insn per cycle
+ # 0.12 stalled cycles per insn (75.18%)
+ 1.131211150 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.991582e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.166278e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.166278e+06 ) sec^-1
-MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
-TOTAL : 1.136939 sec
- 3,174,771,668 cycles # 2.778 GHz
- 7,109,989,939 instructions # 2.24 insn per cycle
- 1.164211542 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288179996423423
-Relative difference = 1.7628858734720142e-10
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288186294492334
+Relative difference = 1.826435805832187e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.146954e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.959760e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.959760e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.559884 sec
- 2,985,663,220 cycles # 1.909 GHz
- 5,764,782,366 instructions # 1.93 insn per cycle
- 1.574614461 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 98f5c2b819..98175f490e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,222 +1,175 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_17:11:19 +DATE: 2024-02-08_19:09:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.024281e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.434380e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.434380e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.671435 sec - 2,677,853,529 cycles # 2.938 GHz - 4,121,864,806 instructions # 1.54 insn per cycle - 0.970344829 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.469110e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.066051e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.066051e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.161093 sec + 3,534,619,605 cycles:u # 2.971 GHz (75.18%) + 21,007,733 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.14%) + 1,154,175,574 stalled-cycles-backend:u # 32.65% backend cycles idle (75.07%) + 3,864,038,545 instructions:u # 1.09 insn per cycle + # 0.30 stalled cycles per insn (74.88%) + 1.219336608 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.324631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.401347e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.401347e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.639106 sec - 14,075,045,585 cycles # 3.030 GHz - 37,121,512,699 instructions # 2.64 insn per cycle - 4.646326776 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.989598e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.079878e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.079878e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.693726 sec + 12,717,760,292 cycles:u # 3.411 GHz (74.92%) + 7,367,503 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%) + 19,690,684 stalled-cycles-backend:u # 0.15% backend cycles idle (74.90%) + 37,107,308,562 instructions:u # 2.92 insn per cycle + # 0.00 stalled cycles per insn (74.96%) + 3.730652000 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.164279e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.609041e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.609041e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.165916 sec - 6,361,590,953 cycles # 2.929 GHz - 15,492,231,939 instructions # 2.44 insn per cycle - 2.173519132 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.079130e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.473563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.473563e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.917660 sec + 6,440,768,919 cycles:u # 3.300 GHz (75.00%) + 7,702,194 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.00%) + 2,203,771,040 stalled-cycles-backend:u # 34.22% backend cycles idle (75.03%) + 15,518,583,539 instructions:u # 2.41 insn per cycle + # 0.14 stalled cycles per insn (75.03%) + 1.955037082 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.218302e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.056192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056192e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.269121 sec - 3,643,049,532 cycles # 2.857 GHz - 7,953,337,878 instructions # 2.18 insn per cycle - 1.276265031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.210191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.364051e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.364051e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.056488 sec + 3,430,795,268 cycles:u # 3.146 GHz (74.97%) + 7,447,191 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.06%) + 937,957,545 stalled-cycles-backend:u # 27.34% backend cycles idle (75.06%) + 7,890,493,966 instructions:u # 2.30 insn per cycle + # 0.12 stalled cycles per insn (75.12%) + 1.093907892 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.012921e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180259e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180259e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.166061 sec - 3,369,726,917 cycles # 2.875 GHz - 7,347,231,326 instructions # 2.18 insn per cycle - 1.173163960 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.480120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.338126e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.338126e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.534240 sec - 3,185,143,707 cycles # 2.067 GHz - 6,021,106,710 instructions # 1.89 insn per cycle - 1.541619608 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 8018096c94..c402bf0e6a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,209 +1,165 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_17:24:47 +DATE: 2024-02-08_19:20:59 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.410759e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.641724e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958540e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.563537 sec - 2,302,158,813 cycles # 2.935 GHz - 3,379,864,652 instructions # 1.47 insn per cycle - 0.842287675 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.128644e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.957374e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.121793e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.014163 sec + 3,102,163,524 cycles:u # 2.988 GHz (74.63%) + 10,658,601 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.97%) + 1,156,816,869 stalled-cycles-backend:u # 37.29% backend cycles idle (75.35%) + 2,768,693,006 instructions:u # 0.89 insn per cycle + # 0.42 stalled cycles per insn (75.39%) + 1.061213020 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.333228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.409970e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.409970e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.633574 sec - 14,062,863,775 cycles # 3.032 GHz - 37,107,530,540 instructions # 2.64 insn per cycle - 4.639726695 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.995911e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.086114e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.086114e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.642503 sec + 12,638,106,769 cycles:u # 3.445 GHz (74.93%) + 7,311,831 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) + 14,963,186 stalled-cycles-backend:u # 0.12% backend cycles idle (74.94%) + 37,156,096,327 instructions:u # 2.94 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 3.670771442 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.234249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.670707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.670707e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 -TOTAL : 2.142800 sec - 6,324,946,525 cycles # 2.945 GHz - 15,223,847,892 instructions # 2.41 insn per cycle - 2.149008605 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.314296e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.749749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.749749e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.802842 sec + 6,199,351,076 cycles:u # 3.389 GHz (74.81%) + 6,748,605 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.03%) + 2,101,796,505 stalled-cycles-backend:u # 33.90% backend cycles idle (75.07%) + 15,211,927,423 instructions:u # 2.45 insn per cycle + # 0.14 stalled cycles per insn (75.08%) + 1.830971384 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.940291e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.027428e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.027428e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.319612 sec - 3,605,863,807 cycles # 2.722 GHz - 7,699,762,069 instructions # 2.14 insn per cycle - 1.326157663 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.221195e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.377891e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.377891e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.001392 sec + 3,363,118,873 cycles:u # 3.273 GHz (74.97%) + 7,454,848 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.09%) + 918,947,185 stalled-cycles-backend:u # 27.32% backend cycles idle (75.09%) + 7,657,243,210 instructions:u # 2.28 insn per cycle + # 0.12 stalled cycles per insn (75.10%) + 1.029676638 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.022976e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198830e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.198830e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.166017 sec - 3,348,738,569 cycles # 2.860 GHz - 7,059,534,247 instructions # 2.11 insn per cycle - 1.172015291 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.610819e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.498385e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.498385e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.520084 sec - 3,146,140,809 cycles # 2.063 GHz - 5,713,379,089 instructions # 1.82 insn per cycle - 1.526188235 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 5e6223e60a..97dabf5ef5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -6,8 +6,9 @@ AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' @@ -41,7 +42,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_17:21:22 +DATE: 2024-02-05_22:06:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +51,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.431874e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.641947e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969075e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.471523e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.653169e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969998e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.510401 sec - 2,139,390,801 cycles # 2.926 GHz - 3,345,521,761 instructions # 1.56 insn per cycle - 0.788934557 seconds time elapsed +TOTAL : 0.507312 sec + 2,173,619,290 cycles # 2.997 GHz + 3,399,295,191 instructions # 1.56 insn per cycle + 0.785350773 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +78,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.334351e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.411221e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.411221e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.381142e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.460290e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.460290e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.577211 sec - 13,894,490,599 cycles # 3.032 GHz - 37,077,812,399 instructions # 2.67 insn per cycle - 4.583588734 seconds time elapsed +TOTAL : 4.486749 sec + 13,885,368,304 cycles # 3.091 GHz + 37,077,531,970 instructions # 2.67 insn per cycle + 4.492745730 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +105,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.298766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.752115e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.752115e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.472143e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.958884e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.958884e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.065378 sec - 6,157,955,875 cycles # 2.974 GHz - 15,211,152,689 instructions # 2.47 insn per cycle - 2.071339807 seconds time elapsed +TOTAL : 2.001306 sec + 6,161,638,049 cycles # 3.071 GHz + 15,211,574,228 instructions # 2.47 insn per cycle + 2.007539907 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.417100e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.084010e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.084010e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.708734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.120132e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120132e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.198941 sec - 3,436,953,265 cycles # 2.855 GHz - 7,714,718,173 instructions # 2.24 insn per cycle - 1.204962695 seconds time elapsed +TOTAL : 1.163460 sec + 3,453,582,210 cycles # 2.956 GHz + 7,714,676,572 instructions # 2.23 insn per cycle + 1.169489643 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +159,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.028737e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.201824e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.201824e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.023571e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.194006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.194006e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.105323 sec - 3,171,632,812 cycles # 2.856 GHz - 7,108,663,806 instructions # 2.24 insn per cycle - 1.111563530 seconds time elapsed +TOTAL : 1.108590 sec + 3,177,081,334 cycles # 2.853 GHz + 7,108,573,900 instructions # 2.24 insn per cycle + 1.114719814 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +186,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.562160e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.432151e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.432151e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.915759e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.846765e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.846765e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.472839 sec - 2,980,761,794 cycles # 2.017 GHz - 5,762,551,506 instructions # 1.93 insn per cycle - 1.478885152 seconds time elapsed +TOTAL : 1.408180 sec + 2,979,297,592 cycles # 2.108 GHz + 5,762,569,288 instructions # 1.93 insn per cycle + 1.414266379 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 17bbbcdc18..7e24e0cb32 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,211 +1,169 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_17:18:02 +DATE: 2024-02-08_19:17:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.767953e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.639786e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.970947e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.615043 sec - 2,455,673,544 cycles # 2.939 GHz - 3,814,343,927 instructions # 1.55 insn per cycle - 0.893079123 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.203851e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.926479e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.088864e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.148947 sec + 3,515,379,084 cycles:u # 3.036 GHz (75.11%) + 22,072,547 stalled-cycles-frontend:u # 0.63% frontend cycles idle (75.20%) + 1,149,905,913 stalled-cycles-backend:u # 32.71% backend cycles idle (75.19%) + 3,771,155,190 instructions:u # 1.07 insn per cycle + # 0.30 stalled cycles per insn (75.19%) + 1.197866574 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.327161e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.404427e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.404427e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.591617 sec - 13,900,476,915 cycles # 3.024 GHz - 37,078,921,215 instructions # 2.67 insn per cycle - 4.597647923 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.990218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.080220e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.080220e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.650953 sec + 12,641,654,314 cycles:u # 3.436 GHz (74.99%) + 7,078,341 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%) + 11,932,077 stalled-cycles-backend:u # 0.09% backend cycles idle (74.99%) + 37,053,528,887 instructions:u # 2.93 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 3.680952514 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.368625e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.834956e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.834956e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.038104 sec - 6,160,516,772 cycles # 3.015 GHz - 15,211,067,224 instructions # 2.47 insn per cycle - 2.044347805 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.112952e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.515228e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.515228e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.860685 sec + 6,366,354,024 cycles:u # 3.371 GHz (75.01%) + 6,927,995 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.01%) + 2,205,196,706 stalled-cycles-backend:u # 34.64% backend cycles idle (75.01%) + 15,202,556,643 instructions:u # 2.39 insn per cycle + # 0.15 stalled cycles per insn (75.01%) + 1.890454940 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.404670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.084087e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.084087e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.200768 sec - 3,447,709,713 cycles # 2.860 GHz - 7,715,262,327 instructions # 2.24 insn per cycle - 1.206803987 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.224299e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.381818e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.381818e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.000758 sec + 3,385,540,430 cycles:u # 3.291 GHz (74.59%) + 7,948,412 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.98%) + 921,498,919 stalled-cycles-backend:u # 27.22% backend cycles idle (75.12%) + 7,650,419,045 instructions:u # 2.26 insn per cycle + # 0.12 stalled cycles per insn (75.12%) + 1.030474482 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024218e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.196064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.196064e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.110275 sec - 3,170,489,061 cycles # 2.843 GHz - 7,108,656,549 instructions # 2.24 insn per cycle - 1.116145524 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.463546e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.319498e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.319498e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.491524 sec - 2,980,281,199 cycles # 1.991 GHz - 5,762,695,736 instructions # 1.93 insn per cycle - 1.497724740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index be4b357efb..55e5f17e9c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,209 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_16:34:32 +DATE: 2024-02-08_18:20:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.629446e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.680893e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.034351e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.486959 sec - 2,018,816,716 cycles # 2.825 GHz - 2,879,661,485 instructions # 1.43 insn per cycle - 0.787632532 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.869687e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.117495e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.301836e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 +TOTAL : 0.329222 sec + 758,075,333 cycles:u # 2.173 GHz (73.98%) + 2,198,225 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.40%) + 5,131,410 stalled-cycles-backend:u # 0.68% backend cycles idle (75.50%) + 1,351,085,970 instructions:u # 1.78 insn per cycle + # 0.00 stalled cycles per insn (75.69%) + 0.378253825 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.318040e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.395392e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.395392e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.610456 sec - 13,808,077,032 cycles # 2.992 GHz - 37,480,687,446 instructions # 2.71 insn per cycle - 4.619234198 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.683405e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.765766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.765766e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.064858 sec + 12,705,974,677 cycles:u # 3.105 GHz (74.98%) + 7,225,703 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) + 9,464,671 stalled-cycles-backend:u # 0.07% backend cycles idle (74.98%) + 37,493,096,667 instructions:u # 2.95 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 4.094241095 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.994423e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.589805e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.589805e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.834502 sec - 5,470,617,338 cycles # 2.973 GHz - 15,244,617,289 instructions # 2.79 insn per cycle - 1.847488005 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.637462e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.178307e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.178307e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.736673 sec + 5,359,824,438 cycles:u # 3.038 GHz (74.86%) + 7,182,035 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.05%) + 1,296,947,856 stalled-cycles-backend:u # 24.20% backend cycles idle (75.07%) + 15,243,397,165 instructions:u # 2.84 insn per cycle + # 0.09 stalled cycles per insn (75.07%) + 1.768220444 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.408507e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.071601e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.071601e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.724801 sec - 4,722,620,558 cycles # 2.729 GHz - 9,849,917,191 instructions # 2.09 insn per cycle - 1.737326705 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180243223906 -Relative difference = 1.1988453753912676e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 7.917228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.592228e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.592228e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.474582 sec + 4,473,730,027 cycles:u # 2.977 GHz (75.00%) + 7,117,732 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.98%) + 1,664,089,796 stalled-cycles-backend:u # 37.20% backend cycles idle (74.98%) + 9,819,840,116 instructions:u # 2.20 insn per cycle + # 0.17 stalled cycles per insn (74.98%) + 1.506565113 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.861072e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.615771e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.615771e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.615802 sec - 4,489,859,292 cycles # 2.769 GHz - 9,201,864,074 instructions # 2.05 insn per cycle - 1.629359197 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180243223906 -Relative difference = 1.1988453753912676e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186428369954 +Relative difference = 1.7604478492421832e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.291197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890714e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.890714e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.754280 sec - 3,451,820,596 cycles # 1.961 GHz - 6,874,597,071 instructions # 1.99 insn per cycle - 1.768591490 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183217635378 -Relative difference = 1.5859655131013432e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 60adea2b86..d8fd9d737f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,209 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_17:01:17 +DATE: 2024-02-08_18:51:20 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.419681e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.633829e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958841e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.481474 sec - 2,050,050,331 cycles # 2.906 GHz - 2,917,103,980 instructions # 1.42 insn per cycle - 0.764514667 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.006988e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.955973e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.119970e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 +TOTAL : 0.319267 sec + 752,328,917 cycles:u # 2.225 GHz (75.00%) + 2,255,449 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.19%) + 4,913,341 stalled-cycles-backend:u # 0.65% backend cycles idle (75.23%) + 1,335,747,468 instructions:u # 1.78 insn per cycle + # 0.00 stalled cycles per insn (77.06%) + 0.366668404 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.459780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.548532e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.548532e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.353055 sec - 12,412,273,179 cycles # 2.849 GHz - 34,218,645,680 instructions # 2.76 insn per cycle - 4.360276449 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.194534e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.299073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.299073e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.425079 sec + 11,759,964,306 cycles:u # 3.407 GHz (74.97%) + 7,329,194 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) + 1,664,701,710 stalled-cycles-backend:u # 14.16% backend cycles idle (74.97%) + 34,201,803,267 instructions:u # 2.91 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 3.454461902 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.219620e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.851279e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.851279e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.771235 sec - 5,357,519,004 cycles # 3.016 GHz - 14,587,191,325 instructions # 2.72 insn per cycle - 1.777278889 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.187168e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.752438e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.752438e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.600326 sec + 5,421,260,654 cycles:u # 3.332 GHz (74.93%) + 7,370,627 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.93%) + 2,014,655,554 stalled-cycles-backend:u # 37.16% backend cycles idle (74.72%) + 14,670,168,796 instructions:u # 2.71 insn per cycle + # 0.14 stalled cycles per insn (74.77%) + 1.630912842 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288192580919713 -Relative difference = 1.2721291123071246e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198769558221 +Relative difference = 6.06481491495597e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.855390e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.823038e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.823038e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
-TOTAL : 1.420079 sec
- 4,057,817,688 cycles # 2.847 GHz
- 9,088,308,136 instructions # 2.24 insn per cycle
- 1.426130725 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe
-[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288180499337614
-Relative difference = 2.4612242975974814e-08
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 9.366961e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.026906e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.026906e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0
+TOTAL : 1.268598 sec
+ 4,255,477,612 cycles:u # 3.285 GHz (74.73%)
+ 8,186,065 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.73%)
+ 1,645,922,864 stalled-cycles-backend:u # 38.68% backend cycles idle (74.90%)
+ 9,083,647,172 instructions:u # 2.13 insn per cycle
+ # 0.18 stalled cycles per insn (75.21%)
+ 1.299091238 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4485) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.422692e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.553877e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.553877e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
-TOTAL : 1.330184 sec
- 3,800,576,658 cycles # 2.846 GHz
- 8,440,632,134 instructions # 2.22 insn per cycle
- 1.336236197 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288180499337614
-Relative difference = 2.4612242975974814e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288186752004549
+Relative difference = 1.6009291367898262e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.840580e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.353187e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.353187e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
-TOTAL : 1.880607 sec
- 3,726,563,519 cycles # 1.976 GHz
- 7,571,520,704 instructions # 2.03 insn per cycle
- 1.886725416 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183350348845
-Relative difference = 1.6513796936156652e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
index afef6ac1df..112654eea6 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt
@@ -1,209 +1,168 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1'
+CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_f_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
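The build header above shows the key change in these logs: the single RNDGEN switch has been split into HASCURAND and HASHIPRAND, and both flags are folded into the build tag. A minimal sketch of that naming scheme follows, with a hypothetical buildTag() helper (not the actual makefile logic; the hasNoHiprand spelling for the opposite case is an assumption):

#include <iostream>
#include <string>

// Hypothetical helper: join the build options into a tag such as
// "avx2_d_inl0_hrd0_hasNoCurand_hasHiprand" as seen in the log above.
std::string buildTag( const std::string& avx, char fptype, int helinl, int hrdcod,
                      bool hasCurand, bool hasHiprand )
{
  return avx + "_" + fptype + "_inl" + std::to_string( helinl ) + "_hrd" + std::to_string( hrdcod )
    + ( hasCurand ? "_hasCurand" : "_hasNoCurand" )
    + ( hasHiprand ? "_hasHiprand" : "_hasNoHiprand" ); // assumed spelling for the no-hiprand case
}

int main()
{
  // Reproduces the tag logged above: avx2_d_inl0_hrd0_hasNoCurand_hasHiprand
  std::cout << buildTag( "avx2", 'd', 0, 0, false, true ) << std::endl;
  return 0;
}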
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_17:01:40 +DATE: 2024-02-08_18:51:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.486059e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.682847e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.021674e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.479591 sec - 2,059,980,015 cycles # 2.926 GHz - 2,913,387,557 instructions # 1.41 insn per cycle - 0.761767944 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.551366e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.113232e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.297209e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 +TOTAL : 0.320151 sec + 759,031,987 cycles:u # 2.225 GHz (74.40%) + 2,153,774 stalled-cycles-frontend:u # 0.28% frontend cycles idle (76.10%) + 4,939,796 stalled-cycles-backend:u # 0.65% backend cycles idle (76.08%) + 1,297,465,264 instructions:u # 1.71 insn per cycle + # 0.00 stalled cycles per insn (74.19%) + 0.370145172 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.607689e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.704935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.704935e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.106422 sec - 11,947,158,125 cycles # 2.906 GHz - 35,406,900,683 instructions # 2.96 insn per cycle - 4.112604276 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.410323e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.529706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.529706e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.217345 sec + 10,905,311,823 cycles:u # 3.362 GHz (74.96%) + 6,729,741 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.07%) + 247,672,337 stalled-cycles-backend:u # 2.27% backend cycles idle (75.10%) + 35,420,582,291 instructions:u # 3.25 insn per cycle + # 0.01 stalled cycles per insn (75.10%) + 3.247331196 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.581826e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.299614e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.299614e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.678445 sec - 5,077,833,467 cycles # 3.017 GHz - 14,044,832,081 instructions # 2.77 insn per cycle - 1.684690456 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.673833e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.335478e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.335478e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.509011 sec + 5,063,533,669 cycles:u # 3.297 GHz (75.00%) + 7,796,156 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.00%) + 1,328,081,677 stalled-cycles-backend:u # 26.23% backend cycles idle (75.00%) + 14,007,267,862 instructions:u # 2.77 insn per cycle + # 0.09 stalled cycles per insn (75.00%) + 1.539528991 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288192554144189 -Relative difference = 1.2589315209891237e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198892958462 +Relative difference = 5.4565783974899003e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.968238e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.961635e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.961635e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.401665 sec - 3,995,496,807 cycles # 2.840 GHz - 8,629,164,416 instructions # 2.16 insn per cycle - 1.407752568 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180815987289 -Relative difference = 4.021983692325164e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 9.945097e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.099043e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.099043e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.195679 sec + 3,997,440,574 cycles:u # 3.270 GHz (74.65%) + 7,909,830 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.87%) + 1,422,447,437 stalled-cycles-backend:u # 35.58% backend cycles idle (75.13%) + 8,582,353,514 instructions:u # 2.15 insn per cycle + # 0.17 stalled cycles per insn (75.13%) + 1.225635820 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3406) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.704330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.914973e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.914973e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.290447 sec - 3,691,505,793 cycles # 2.850 GHz - 8,100,617,850 instructions # 2.19 insn per cycle - 1.296502001 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180815987289 -Relative difference = 4.021983692325164e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186836987734 +Relative difference = 1.559041129563128e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.113348e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.685290e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.685290e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.800816 sec - 3,588,483,895 cycles # 1.987 GHz - 7,373,337,766 instructions # 2.05 insn per cycle - 1.806673377 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
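The perf counters in these runs are tied together by simple arithmetic: "insn per cycle" is instructions:u divided by cycles:u, while the quoted GHz divides cycles:u by the task-clock (time actually spent on-CPU), which is why it does not match a division by the wall-clock "seconds time elapsed". A small cross-check using the counters from the HIP gcheck run logged above:

#include <cstdio>

int main()
{
  // Values copied from the gcheck.exe run on the AMD MI200 node above.
  const double cycles = 759031987.;        // cycles:u
  const double instructions = 1297465264.; // instructions:u
  const double elapsed = 0.370145172;      // seconds time elapsed (wall clock)
  std::printf( "insn per cycle   = %.2f\n", instructions / cycles ); // ~1.71, as logged
  // Dividing by wall-clock time gives ~2.05 GHz, below the logged 2.225 GHz,
  // because perf normalises by task-clock rather than elapsed time.
  std::printf( "cycles / elapsed = %.3f GHz\n", cycles / elapsed / 1e9 );
  return 0;
}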
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028818e+00
-Avg ME (F77/C++) = 2.0288183569209650
-Relative difference = 1.7592557106041962e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
index 87374f3780..7755f71ce4 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
@@ -1,209 +1,168 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_16:34:57 +DATE: 2024-02-08_18:20:55 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.031501e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.139082e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.267702e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.541586 sec - 2,196,377,842 cycles # 2.814 GHz - 3,120,246,937 instructions # 1.42 insn per cycle - 0.858400490 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.904056e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.014821e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.070633e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.388289 sec + 864,348,193 cycles:u # 2.121 GHz (73.88%) + 2,130,510 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.67%) + 5,219,675 stalled-cycles-backend:u # 0.60% backend cycles idle (76.29%) + 1,361,129,233 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (76.46%) + 0.442760013 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/CUDA) = 2.0288063423243869 +Relative difference = 3.241686434838304e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.045030e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.104952e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.104952e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.237057 sec - 15,229,413,354 cycles # 2.905 GHz - 39,293,839,753 instructions # 2.58 insn per cycle - 5.246210519 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.216739e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.273764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.273764e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.926618 sec + 15,301,124,701 cycles:u # 3.086 GHz (75.00%) + 9,211,660 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%) + 209,883,841 stalled-cycles-backend:u # 1.37% backend cycles idle (74.99%) + 39,304,216,708 instructions:u # 2.57 insn per cycle + # 0.01 stalled cycles per insn (75.01%) + 4.960995675 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.584464e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.786578e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.786578e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.034999 sec - 8,833,525,150 cycles # 2.905 GHz - 24,093,446,753 instructions # 2.73 insn per cycle - 3.052140649 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.145121e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.361455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.361455e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.726938 sec + 8,377,842,264 cycles:u # 3.037 GHz (74.95%) + 9,175,705 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.07%) + 869,503,656 stalled-cycles-backend:u # 10.38% backend cycles idle (75.07%) + 24,090,997,954 instructions:u # 2.88 insn per cycle + # 0.04 stalled cycles per insn (75.07%) + 2.762975684 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.499659e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.985026e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.985026e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.017169 sec - 5,479,557,597 cycles # 2.708 GHz - 11,449,041,439 instructions # 2.09 insn per cycle - 2.031726068 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.969064e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.490580e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.490580e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.687044 sec + 5,054,763,540 cycles:u # 2.943 GHz (74.85%) + 9,457,236 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.89%) + 470,935,529 stalled-cycles-backend:u # 9.32% backend cycles idle (74.89%) + 11,453,581,126 instructions:u # 2.27 insn per cycle + # 0.04 stalled cycles per insn (74.86%) + 1.721716287 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2451) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.458442e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.133057e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.133057e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.733855 sec - 4,781,796,134 cycles # 2.748 GHz - 10,317,356,829 instructions # 2.16 insn per cycle - 1.750510846 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.102992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.366243e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.366243e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.669332 sec - 4,846,427,781 cycles # 1.812 GHz - 7,366,959,454 instructions # 1.52 insn per cycle - 2.686758075 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
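The "not supported (no avx512vl in /proc/cpuinfo)" lines above show the 512y and 512z binaries being skipped, not failed, on this AMD EPYC host. A sketch of such a runtime guard, assuming a plain scan of the flags line in /proc/cpuinfo (the repository's actual check may be implemented differently):

#include <fstream>
#include <iostream>
#include <string>

// Return true if any "flags" line of /proc/cpuinfo lists the given feature.
bool cpuHasFlag( const std::string& flag )
{
  std::ifstream cpuinfo( "/proc/cpuinfo" );
  std::string line;
  while( std::getline( cpuinfo, line ) )
    if( line.rfind( "flags", 0 ) == 0 && line.find( " " + flag ) != std::string::npos )
      return true;
  return false;
}

int main()
{
  if( !cpuHasFlag( "avx512vl" ) )
  {
    std::cout << "not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
    return 0; // skip gracefully rather than crash on an illegal instruction
  }
  std::cout << "avx512vl available, AVX512 builds can run" << std::endl;
  return 0;
}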
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063930599014
-Relative difference = 2.9916108265801754e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
index 0569c05202..aca89217a0 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt
@@ -1,209 +1,168 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-02_16:35:25 +DATE: 2024-02-08_18:21:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.024521e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.134296e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271070e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.538906 sec - 2,208,920,645 cycles # 2.839 GHz - 3,114,971,809 instructions # 1.41 insn per cycle - 0.848861344 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.854985e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.926967e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.980718e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.390795 sec + 884,636,377 cycles:u # 2.137 GHz (74.00%) + 2,082,258 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.82%) + 5,724,825 stalled-cycles-backend:u # 0.65% backend cycles idle (74.93%) + 1,447,624,356 instructions:u # 1.64 insn per cycle + # 0.00 stalled cycles per insn (75.12%) + 0.443869891 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/CUDA) = 2.0288063423243869 +Relative difference = 3.241686434838304e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.077520e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.138470e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.138470e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.156475 sec - 15,070,701,440 cycles # 2.920 GHz - 40,114,901,053 instructions # 2.66 insn per cycle - 5.165730389 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.186066e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.238603e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.238603e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.987560 sec + 15,495,821,711 cycles:u # 3.088 GHz (74.98%) + 9,990,162 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) + 10,915,295 stalled-cycles-backend:u # 0.07% backend cycles idle (74.99%) + 40,144,709,771 instructions:u # 2.59 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 5.021690865 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.603756e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.809488e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.809488e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.019218 sec - 8,678,864,495 cycles # 2.869 GHz - 23,533,854,594 instructions # 2.71 insn per cycle - 3.038108808 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.063270e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.270255e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.270255e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.775738 sec + 8,514,738,248 cycles:u # 3.033 GHz (74.93%) + 11,036,987 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.95%) + 676,892,006 stalled-cycles-backend:u # 7.95% backend cycles idle (74.94%) + 23,573,068,748 instructions:u # 2.77 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 2.811372386 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.025592e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.418018e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.418018e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.195864 sec - 6,167,419,394 cycles # 2.801 GHz - 13,102,886,049 instructions # 2.12 insn per cycle - 2.211451093 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.224432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.662940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.662940e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.882137 sec + 5,711,671,390 cycles:u # 2.984 GHz (74.93%) + 9,151,587 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.96%) + 710,469,523 stalled-cycles-backend:u # 12.44% backend cycles idle (74.96%) + 13,147,735,569 instructions:u # 2.30 insn per cycle + # 0.05 stalled cycles per insn (74.93%) + 1.917994686 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.415025e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.865260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.865260e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.045632 sec - 5,764,215,972 cycles # 2.810 GHz - 12,211,460,535 instructions # 2.12 insn per cycle - 2.060012903 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.979522e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.215324e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.215324e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.743892 sec - 5,260,192,706 cycles # 1.913 GHz - 8,448,878,166 instructions # 1.61 insn per cycle - 2.760577469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
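
The "+... is not supported (no avx512vl in /proc/cpuinfo)" lines above replace the former 512y and 512z benchmark runs: the new AMD EPYC 7A53 host does not implement AVX512VL, so the test driver now skips those executables instead of running them. As a hedged illustration only (the message suggests the real driver inspects /proc/cpuinfo; the gcc/clang builtin below is an assumption used to keep the sketch self-contained), such a guard could look like this:

#include <cstdio>
int main()
{
  // Skip (exit 0) rather than fail when the CPU lacks AVX512VL, mirroring
  // the "is not supported (no avx512vl in /proc/cpuinfo)" messages above.
  if( !__builtin_cpu_supports( "avx512vl" ) )
  {
    std::printf( "check.exe is not supported (no avx512vl in /proc/cpuinfo)\n" );
    return 0;
  }
  std::printf( "avx512vl found: the 512y/512z builds can be exercised\n" );
  return 0;
}
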
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063930599014
-Relative difference = 2.9916108265801754e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
index 02108b2de1..2d74f3e2ca 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-02_16:35:54 +DATE: 2024-02-08_18:21:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.647700e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047128e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.063198e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.470011 sec - 1,992,886,570 cycles # 2.916 GHz - 2,848,178,519 instructions # 1.43 insn per cycle - 0.762759403 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.896793e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042103e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.049979e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 +TOTAL : 0.438306 sec + 1,018,654,022 cycles:u # 2.283 GHz (75.30%) + 2,342,193 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.77%) + 5,357,864 stalled-cycles-backend:u # 0.53% backend cycles idle (74.68%) + 1,506,452,711 instructions:u # 1.48 insn per cycle + # 0.00 stalled cycles per insn (74.61%) + 0.486442473 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.048749e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318342e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.335429e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.619181 sec - 2,446,644,924 cycles # 2.832 GHz - 3,641,461,027 instructions # 1.49 insn per cycle - 0.923142388 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.626166e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.842773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.848585e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.644740 sec + 1,614,380,153 cycles:u # 2.419 GHz (74.94%) + 2,305,119 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.90%) + 5,060,661 stalled-cycles-backend:u # 0.31% backend cycles idle (75.06%) + 1,903,252,561 instructions:u # 1.18 insn per cycle + # 0.00 stalled cycles per insn (75.71%) + 0.693964886 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.482496e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.494903e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.494903e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.624642 sec - 19,529,563,717 cycles # 2.947 GHz - 57,921,760,115 instructions # 2.97 insn per cycle - 6.632171417 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.664905e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.676215e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.676215e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 6.174747 sec + 19,594,832,239 cycles:u # 3.161 GHz (74.97%) + 3,179,318 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) + 3,393,997,547 stalled-cycles-backend:u # 17.32% backend cycles idle (74.96%) + 57,922,192,784 instructions:u # 2.96 insn per cycle + # 0.06 stalled cycles per insn (74.92%) + 6.201296621 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.824705e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.870001e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.870001e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.418984 sec - 10,197,860,001 cycles # 2.979 GHz - 29,945,021,208 instructions # 2.94 insn per cycle - 3.437108833 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.445350e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.492063e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.492063e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 3.035062 sec + 9,609,956,734 cycles:u # 3.142 GHz (74.92%) + 2,571,796 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.89%) + 2,372,598,985 stalled-cycles-backend:u # 24.69% backend cycles idle (74.90%) + 29,983,159,273 instructions:u # 3.12 insn per cycle + # 0.08 stalled cycles per insn (74.94%) + 3.061682288 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.413328e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.581718e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.581718e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.763624 sec - 4,911,018,728 cycles # 2.777 GHz - 11,211,073,816 instructions # 2.28 insn per cycle - 1.778147386 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.097086e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.115819e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115819e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.519592 sec + 4,808,370,439 cycles:u # 3.115 GHz (74.74%) + 2,569,559 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.96%) + 1,462,517,151 stalled-cycles-backend:u # 30.42% backend cycles idle (75.13%) + 11,197,235,575 instructions:u # 2.33 insn per cycle + # 0.13 stalled cycles per insn (75.13%) + 1.547014856 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.083906e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.106566e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.106566e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.534874 sec - 4,298,734,637 cycles # 2.793 GHz - 10,188,521,401 instructions # 2.37 insn per cycle - 1.548247231 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.700970e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.816446e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.816446e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.153155 sec - 3,902,810,168 cycles # 1.809 GHz - 5,709,086,856 instructions # 1.46 insn per cycle - 2.167587173 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
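
The recurring "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" lines above document a deliberate fallback, also visible as the CURHST-to-COMMON change in the workflow summaries: when momenta are generated on the host, the random numbers must live on the host too, and since no host-side hiprand generator is implemented yet, the plugin drops back to its common C++ generator. A minimal sketch of that decision, with illustrative names rather than the plugin's actual API, follows:

#include <cstdio>
enum class RndMode { CommonRandom, HiprandDevice }; // hypothetical enum, for illustration only

RndMode chooseRndMode( bool ramboOnDevice )
{
  if( ramboOnDevice ) return RndMode::HiprandDevice; // device-side rambo can consume device-side randoms
  return RndMode::CommonRandom; // RamboHost: no HiprandHost yet, fall back to common random numbers
}

int main()
{
  if( chooseRndMode( false ) == RndMode::CommonRandom ) // RamboHost scenario from the logs above
    std::printf( "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)\n" );
  return 0;
}
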
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213684416484
-Relative difference = 4.469241520660492e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
index 2413213f70..0163eb03d1 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt
@@ -1,240 +1,196 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-02_17:11:43 +DATE: 2024-02-08_19:09:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.638747e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.778406e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.778406e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.493129 sec - 2,063,227,906 cycles # 2.937 GHz - 3,107,629,962 instructions # 1.51 insn per cycle - 0.762401849 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.493228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.015882e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.015882e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.564258 sec + 1,652,077,487 cycles:u # 2.814 GHz (74.62%) + 10,611,849 stalled-cycles-frontend:u # 0.64% frontend cycles idle (75.41%) + 258,215,190 stalled-cycles-backend:u # 15.63% backend cycles idle (75.45%) + 2,020,090,968 instructions:u # 1.22 insn per cycle + # 0.13 stalled cycles per insn (75.09%) + 0.610698195 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.695136e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.498135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.498135e+06 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.829850 sec - 3,175,446,318 cycles # 2.935 GHz - 4,944,886,553 instructions # 1.56 insn per cycle - 1.143726910 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.195589e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.670385e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.670385e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.268176 sec + 3,822,769,864 cycles:u # 2.920 GHz (75.21%) + 30,801,656 stalled-cycles-frontend:u # 0.81% frontend cycles idle (74.97%) + 859,824,473 stalled-cycles-backend:u # 22.49% backend cycles idle (75.04%) + 3,882,660,072 instructions:u # 1.02 insn per cycle + # 0.22 stalled cycles per insn (74.99%) + 1.331437909 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.550890e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.563789e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.563789e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.452640 sec - 19,539,158,300 cycles # 3.026 GHz - 57,927,205,889 instructions # 2.96 insn per cycle - 6.457892701 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.949222e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.961653e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.961653e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.583285 sec + 19,616,988,865 cycles:u # 3.499 GHz (74.97%) + 3,125,909 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.03%) + 3,422,426,818 stalled-cycles-backend:u # 17.45% backend cycles idle (75.03%) + 57,899,685,241 instructions:u # 2.95 insn per cycle + # 0.06 stalled cycles per insn (75.03%) + 5.608407195 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.849193e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.895463e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.895463e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.408729 sec - 10,236,712,849 cycles # 3.001 GHz - 29,991,551,658 instructions # 2.93 insn per cycle - 3.414236691 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.053353e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.104745e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.104745e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.734583 sec + 9,609,540,434 cycles:u # 3.485 GHz (75.05%) + 1,957,694 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.05%) + 2,381,141,376 stalled-cycles-backend:u # 24.78% backend cycles idle (74.91%) + 29,996,044,463 instructions:u # 3.12 insn per cycle + # 0.08 stalled cycles per insn (74.91%) + 2.761024409 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.525564e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.704438e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.704438e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.750561 sec - 4,951,427,306 cycles # 2.822 GHz - 11,259,386,014 instructions # 2.27 insn per cycle - 1.757443561 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.239724e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.261065e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.261065e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.349709 sec + 4,754,036,505 cycles:u # 3.464 GHz (74.94%) + 2,353,246 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.94%) + 1,461,374,334 stalled-cycles-backend:u # 30.74% backend cycles idle (74.94%) + 11,263,062,636 instructions:u # 2.37 insn per cycle + # 0.13 stalled cycles per insn (74.94%) + 1.375449114 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.093981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.117302e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.117302e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.527374 sec - 4,339,294,073 cycles # 2.834 GHz - 10,236,150,971 instructions # 2.36 insn per cycle - 1.532576678 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.888608e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.013696e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.013696e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.108343 sec - 3,945,448,685 cycles # 1.868 GHz - 5,745,888,089 instructions # 1.46 insn per cycle - 2.113640206 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
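
Every cmpExe block above ends with the same acceptance test: the average matrix element of the C++ (or HIP) run is compared with the Fortran (F77) value, and the run passes while the relative difference stays within the 5E-3 tolerance. The self-contained sketch below recomputes the check from the values printed in the log; the exact normalisation used by the test script is not shown in the log (the sketch divides by the Fortran value), and since the C++ average is printed rounded to seven significant digits, the recomputed difference only approximately matches the logged 4.469241520660492e-07.

#include <cmath>
#include <cstdio>
int main()
{
  const double avgMeCpp = 1.413122e+00;       // Avg ME (C++/C++), as printed (rounded)
  const double avgMeF77 = 1.4131213684416484; // Avg ME (F77/C++)
  const double relDiff = std::fabs( avgMeCpp - avgMeF77 ) / std::fabs( avgMeF77 );
  std::printf( "Relative difference = %.16e\n", relDiff );
  std::printf( "%s\n", relDiff <= 5e-3 ? "OK (relative difference <= 5E-3)" : "ERROR (relative difference > 5E-3)" );
  return relDiff <= 5e-3 ? 0 : 1;
}
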
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213684416484
-Relative difference = 4.469241520660492e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
index 0180ae742c..884bcfad54 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-02-02_16:36:24
+DATE: 2024-02-08_18:22:10
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.433043e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.037683e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.055051e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.467578 sec
- 1,969,111,241 cycles # 2.878 GHz
- 2,826,460,647 instructions # 1.44 insn per cycle
- 0.755352341 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.906147e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.039261e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.046147e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2
+TOTAL : 0.439845 sec
+ 1,023,928,489 cycles:u # 2.320 GHz (74.84%)
+ 2,294,233 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.09%)
+ 5,690,077 stalled-cycles-backend:u # 0.56% backend cycles idle (74.23%)
+ 1,552,350,558 instructions:u # 1.52 insn per cycle
+ # 0.00 stalled cycles per insn (72.68%)
+ 0.484797136 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.036298e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.305213e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.321315e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.608611 sec
- 2,463,237,160 cycles # 2.896 GHz
- 3,725,514,816 instructions # 1.51 insn per cycle
- 0.911446401 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.592965e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.792693e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.797749e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2
+TOTAL : 0.644636 sec
+ 1,588,228,432 cycles:u # 2.368 GHz (75.02%)
+ 2,221,853 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.02%)
+ 5,385,824 stalled-cycles-backend:u # 0.34% backend cycles idle (75.03%)
+ 1,972,260,150 instructions:u # 1.24 insn per cycle
+ # 0.00 stalled cycles per insn (74.42%)
+ 0.694016128 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 1.413122e+00
-Avg ME (F77/CUDA) = 1.4131213684418649
-Relative difference = 4.469239988637851e-07
+Avg ME (F77/CUDA) = 1.4131213684418642
+Relative difference = 4.4692399933517674e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.502116e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.514776e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.514776e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.572307 sec
- 19,511,835,294 cycles # 2.968 GHz
- 57,748,497,183 instructions # 2.96 insn per cycle
- 6.579569440 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.634125e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.644901e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.644901e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 6.245570 sec
+ 20,056,849,486 cycles:u # 3.200 GHz (74.99%)
+ 2,954,288 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%)
+ 3,112,319,221 stalled-cycles-backend:u # 15.52% backend cycles idle (74.99%)
+ 57,703,493,427 instructions:u # 2.88 insn per cycle
+ # 0.05 stalled cycles per insn (74.99%)
+ 6.271210651 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213684432431
Relative difference = 4.4692302355460254e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.719110e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.762501e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.762501e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.497147 sec
- 10,260,653,948 cycles # 2.932 GHz
- 30,333,939,390 instructions # 2.96 insn per cycle
- 3.513307032 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.268056e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.312584e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.312584e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 3.136288 sec
+ 9,733,143,847 cycles:u # 3.080 GHz (74.94%)
+ 2,615,873 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.94%)
+ 2,287,852,817 stalled-cycles-backend:u # 23.51% backend cycles idle (74.94%)
+ 30,344,746,875 instructions:u # 3.12 insn per cycle
+ # 0.08 stalled cycles per insn (74.96%)
+ 3.163945104 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213684432433
Relative difference = 4.46923023397472e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.806876e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.962575e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.962575e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.884417 sec
- 5,061,109,783 cycles # 2.680 GHz
- 11,665,012,561 instructions # 2.30 insn per cycle
- 1.896543423 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.054056e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.071587e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.071587e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 1.580938 sec
+ 4,894,522,264 cycles:u # 3.049 GHz (75.08%)
+ 1,978,954 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.08%)
+ 1,691,604,797 stalled-cycles-backend:u # 34.56% backend cycles idle (75.09%)
+ 11,675,651,815 instructions:u # 2.39 insn per cycle
+ # 0.14 stalled cycles per insn (75.09%)
+ 1.608574799 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4471) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213684416484
Relative difference = 4.469241520660492e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.010188e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.029607e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.029607e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.644870 sec
- 4,611,507,492 cycles # 2.796 GHz
- 10,806,422,331 instructions # 2.34 insn per cycle
- 1.660585667 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213684416484
-Relative difference = 4.469241520660492e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.574680e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.689797e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.689797e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.188611 sec
- 3,952,386,207 cycles # 1.802 GHz
- 5,998,821,802 instructions # 1.52 insn per cycle
- 2.200930358 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213684416484
-Relative difference = 4.469241520660492e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
index 85745d58f2..3375e46eaf 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-02-02_16:36:54
+DATE: 2024-02-08_18:22:37
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.316523e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.262832e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.370668e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2
-TOTAL : 0.450538 sec
- 1,931,886,459 cycles # 2.904 GHz
- 2,736,867,215 instructions # 1.42 insn per cycle
- 0.740934676 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 254
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 4.934586e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.876139e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.928934e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2
+TOTAL : 0.383720 sec
+ 822,049,704 cycles:u # 2.134 GHz (74.53%)
+ 2,208,723 stalled-cycles-frontend:u # 0.27% frontend cycles idle (76.04%)
+ 4,843,042 stalled-cycles-backend:u # 0.59% backend cycles idle (75.16%)
+ 1,393,763,752 instructions:u # 1.70 insn per cycle
+ # 0.00 stalled cycles per insn (74.22%)
+ 0.430994484 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.048324e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.390949e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.489780e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2
-TOTAL : 0.501563 sec
- 2,120,690,071 cycles # 2.898 GHz
- 3,026,799,574 instructions # 1.43 insn per cycle
- 0.789192986 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.370264e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.631551e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.636459e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2
+TOTAL : 0.472621 sec
+ 1,133,739,668 cycles:u # 2.300 GHz (72.63%)
+ 2,155,754 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.19%)
+ 4,276,204 stalled-cycles-backend:u # 0.38% backend cycles idle (75.55%)
+ 1,545,443,044 instructions:u # 1.36 insn per cycle
+ # 0.00 stalled cycles per insn (75.54%)
+ 0.520942641 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.412608e+00
-Avg ME (F77/CUDA) = 1.4132214346515752
-Relative difference = 0.00043425681546129636
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 1.412404e+00
+Avg ME (F77/CUDA) = 1.4131669530965212
+Relative difference = 0.0005401804983001964
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.671760e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.686395e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.686395e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.154456 sec
- 18,176,126,036 cycles # 2.951 GHz
- 55,238,282,139 instructions # 3.04 insn per cycle
- 6.161684220 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.864581e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.878140e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.878140e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2
+TOTAL : 5.743938 sec
+ 17,773,852,419 cycles:u # 3.082 GHz (74.91%)
+ 2,894,955 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%)
+ 3,669,486,104 stalled-cycles-backend:u # 20.65% backend cycles idle (75.03%)
+ 55,281,356,088 instructions:u # 3.11 insn per cycle
+ # 0.07 stalled cycles per insn (75.04%)
+ 5.770116566 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.412998e+00
-Avg ME (F77/C++) = 1.4129977771372637
-Relative difference = 1.5772332039074602e-07
+Avg ME (F77/C++) = 1.4129978146120550
+Relative difference = 1.3120184529301602e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.766497e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.924096e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.924096e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.890070 sec
- 5,682,505,245 cycles # 3.000 GHz
- 16,128,272,752 instructions # 2.84 insn per cycle
- 1.903330515 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.491676e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.641990e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.641990e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2
+TOTAL : 1.750083 sec
+ 5,438,030,974 cycles:u # 3.066 GHz (74.75%)
+ 2,225,662 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.78%)
+ 1,684,378,735 stalled-cycles-backend:u # 30.97% backend cycles idle (74.99%)
+ 16,144,361,424 instructions:u # 2.97 insn per cycle
+ # 0.10 stalled cycles per insn (75.19%)
+ 1.777268015 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.412986e+00
-Avg ME (F77/C++) = 1.4129864902818952
-Relative difference = 3.469828399449743e-07
+Avg ME (F77/C++) = 1.4129857118325333
+Relative difference = 2.039421953066926e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.737724e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.800702e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.800702e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.965546 sec
- 2,595,320,414 cycles # 2.685 GHz
- 6,087,943,191 instructions # 2.35 insn per cycle
- 1.063377404 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.083535e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.154597e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.154597e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2
+TOTAL : 0.810354 sec
+ 2,533,172,837 cycles:u # 3.037 GHz (75.07%)
+ 1,780,574 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.07%)
+ 832,632,139 stalled-cycles-backend:u # 32.87% backend cycles idle (75.07%)
+ 6,082,444,674 instructions:u # 2.40 insn per cycle
+ # 0.14 stalled cycles per insn (75.08%)
+ 0.838024436 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133158486847037
-Relative difference = 1.0706402269051248e-07
+Avg ME (F77/C++) = 1.4133162680784324
+Relative difference = 1.896804623606238e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.079968e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.166140e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.166140e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.809004 sec
- 2,291,761,168 cycles # 2.817 GHz
- 5,553,353,487 instructions # 2.42 insn per cycle
- 0.823484208 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133158486847037
-Relative difference = 1.0706402269051248e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.532704e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.580565e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.580565e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.092503 sec
- 2,015,471,111 cycles # 1.837 GHz
- 3,286,131,399 instructions # 1.63 insn per cycle
- 1.108765161 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133164031689205
-Relative difference = 2.852645271622733e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
index 1a9250d60d..c81ca50562 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
@@ -1,240 +1,196 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-02-02_17:12:13
+DATE: 2024-02-08_19:10:18
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.008267e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.160775e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.160775e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2
-TOTAL : 0.458677 sec
- 1,965,032,172 cycles # 2.936 GHz
- 2,899,429,257 instructions # 1.48 insn per cycle
- 0.727330127 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 254
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 3.273252e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.588256e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.588256e+06 ) sec^-1
+MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2
+TOTAL : 0.499256 sec
+ 1,455,482,979 cycles:u # 2.784 GHz (73.33%)
+ 10,630,229 stalled-cycles-frontend:u # 0.73% frontend cycles idle (73.99%)
+ 276,661,876 stalled-cycles-backend:u # 19.01% backend cycles idle (76.12%)
+ 1,855,040,061 instructions:u # 1.27 insn per cycle
+ # 0.15 stalled cycles per insn (76.37%)
+ 0.546123141 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.765178e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.594179e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.594179e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2
-TOTAL : 0.636101 sec
- 2,555,953,093 cycles # 2.945 GHz
- 3,910,584,191 instructions # 1.53 insn per cycle
- 0.925538331 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.140904e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.468511e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.468511e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2
+TOTAL : 1.068236 sec
+ 3,219,691,880 cycles:u # 2.925 GHz (75.29%)
+ 29,579,411 stalled-cycles-frontend:u # 0.92% frontend cycles idle (75.36%)
+ 859,498,606 stalled-cycles-backend:u # 26.70% backend cycles idle (75.33%)
+ 3,423,691,451 instructions:u # 1.06 insn per cycle
+ # 0.25 stalled cycles per insn (75.39%)
+ 1.125475723 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.412608e+00
-Avg ME (F77/CUDA) = 1.4132214346515752
-Relative difference = 0.00043425681546129636
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 1.412404e+00
+Avg ME (F77/CUDA) = 1.4131669530965212
+Relative difference = 0.0005401804983001964
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.726659e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.741757e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.741757e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.034153 sec
- 18,196,817,282 cycles # 3.015 GHz
- 55,243,539,762 instructions # 3.04 insn per cycle
- 6.039211086 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.256660e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.272135e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.272135e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2
+TOTAL : 5.054670 sec
+ 17,761,310,035 cycles:u # 3.498 GHz (74.95%)
+ 2,689,568 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%)
+ 3,663,845,845 stalled-cycles-backend:u # 20.63% backend cycles idle (74.96%)
+ 55,301,407,855 instructions:u # 3.11 insn per cycle
+ # 0.07 stalled cycles per insn (75.00%)
+ 5.079610960 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.412998e+00
-Avg ME (F77/C++) = 1.4129977771372637
-Relative difference = 1.5772332039074602e-07
+Avg ME (F77/C++) = 1.4129978146120550
+Relative difference = 1.3120184529301602e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.790717e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.946780e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.946780e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.889033 sec
- 5,703,594,876 cycles # 3.014 GHz
- 16,175,359,206 instructions # 2.84 insn per cycle
- 1.894100782 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.061177e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.077735e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.077735e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2
+TOTAL : 1.568327 sec
+ 5,535,132,011 cycles:u # 3.479 GHz (74.86%)
+ 2,505,823 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.86%)
+ 1,722,362,524 stalled-cycles-backend:u # 31.12% backend cycles idle (74.86%)
+ 16,224,209,549 instructions:u # 2.93 insn per cycle
+ # 0.11 stalled cycles per insn (74.90%)
+ 1.594493860 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.412986e+00
-Avg ME (F77/C++) = 1.4129864902818952
-Relative difference = 3.469828399449743e-07
+Avg ME (F77/C++) = 1.4129857118325333
+Relative difference = 2.039421953066926e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.835404e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.902784e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.902784e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.917789 sec
- 2,606,304,513 cycles # 2.829 GHz
- 6,121,685,348 instructions # 2.35 insn per cycle
- 0.922536991 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133158486847037
-Relative difference = 1.0706402269051248e-07
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 2.348338e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.427878e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.427878e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2
+TOTAL : 0.721667 sec
+ 2,578,926,819 cycles:u # 3.465 GHz (74.41%)
+ 2,120,454 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.92%)
+ 821,948,498 stalled-cycles-backend:u # 31.87% backend cycles idle (75.28%)
+ 6,131,768,654 instructions:u # 2.38 insn per cycle
+ # 0.13 stalled cycles per insn (75.29%)
+ 0.747585285 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.086689e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.172924e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.172924e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.810523 sec
- 2,308,468,251 cycles # 2.834 GHz
- 5,588,973,181 instructions # 2.42 insn per cycle
- 0.815616294 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133158486847037
-Relative difference = 1.0706402269051248e-07
+Avg ME (F77/C++) = 1.4133162680784324
+Relative difference = 1.896804623606238e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.487605e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.533770e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.533770e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.129951 sec
- 2,041,408,314 cycles # 1.800 GHz
- 3,327,118,208 instructions # 1.63 insn per cycle
- 1.135269811 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133164031689205
-Relative difference = 2.852645271622733e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
index 22513c5ac3..d59c4504fd 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-02-02_16:37:19
+DATE: 2024-02-08_18:23:01
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.338258e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.250138e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.357322e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2
-TOTAL : 0.449553 sec
- 1,908,594,838 cycles # 2.863 GHz
- 2,683,799,157 instructions # 1.41 insn per cycle
- 0.737936666 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 248
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 6.321746e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.937510e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.078623e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2
+TOTAL : 0.358230 sec
+ 815,009,587 cycles:u # 2.143 GHz (74.71%)
+ 2,398,638 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.88%)
+ 5,610,405 stalled-cycles-backend:u # 0.69% backend cycles idle (74.49%)
+ 1,368,390,070 instructions:u # 1.68 insn per cycle
+ # 0.00 stalled cycles per insn (74.06%)
+ 0.405563659 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.011156e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.299478e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.394759e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2
-TOTAL : 0.505111 sec
- 2,069,711,861 cycles # 2.813 GHz
- 2,965,164,553 instructions # 1.43 insn per cycle
- 0.793558308 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.427126e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.704218e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.710859e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2
+TOTAL : 0.489379 sec
+ 1,126,447,874 cycles:u # 2.287 GHz (73.61%)
+ 2,329,063 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.92%)
+ 4,613,066 stalled-cycles-backend:u # 0.41% backend cycles idle (75.63%)
+ 1,569,017,375 instructions:u # 1.39 insn per cycle
+ # 0.00 stalled cycles per insn (75.42%)
+ 0.536446725 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.412608e+00
-Avg ME (F77/CUDA) = 1.4132214346515752
-Relative difference = 0.00043425681546129636
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 1.412404e+00
+Avg ME (F77/CUDA) = 1.4131669531526541
+Relative difference = 0.0005401805380429868
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.653049e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.667642e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.667642e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 6.201162 sec
- 18,131,448,709 cycles # 2.923 GHz
- 54,991,482,939 instructions # 3.03 insn per cycle
- 6.208401201 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.879133e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.892813e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.892813e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2
+TOTAL : 5.714809 sec
+ 17,746,155,755 cycles:u # 3.093 GHz (74.92%)
+ 3,036,147 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%)
+ 2,957,234,706 stalled-cycles-backend:u # 16.66% backend cycles idle (75.01%)
+ 55,042,109,625 instructions:u # 3.10 insn per cycle
+ # 0.05 stalled cycles per insn (75.05%)
+ 5.740554552 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.412998e+00
-Avg ME (F77/C++) = 1.4129977771372637
-Relative difference = 1.5772332039074602e-07
+Avg ME (F77/C++) = 1.4129978146120550
+Relative difference = 1.3120184529301602e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.997563e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.161716e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.161716e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.841615 sec
- 5,531,435,247 cycles # 2.996 GHz
- 16,222,794,890 instructions # 2.93 insn per cycle
- 1.853021416 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.914351e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.007906e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.007906e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2
+TOTAL : 1.676296 sec
+ 5,213,242,764 cycles:u # 3.066 GHz (75.06%)
+ 2,215,862 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.06%)
+ 1,518,921,044 stalled-cycles-backend:u # 29.14% backend cycles idle (75.06%)
+ 16,252,369,662 instructions:u # 3.12 insn per cycle
+ # 0.09 stalled cycles per insn (75.07%)
+ 1.703555763 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.412986e+00
-Avg ME (F77/C++) = 1.4129863487235070
-Relative difference = 2.4679898241023883e-07
+Avg ME (F77/C++) = 1.4129857712652836
+Relative difference = 1.618803841657786e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.581146e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.630242e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.630242e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 1.056979 sec
- 2,975,573,093 cycles # 2.803 GHz
- 6,708,205,721 instructions # 2.25 insn per cycle
- 1.072519725 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.847863e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.904148e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.904148e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2
+TOTAL : 0.910488 sec
+ 2,859,225,307 cycles:u # 3.060 GHz (74.57%)
+ 2,327,605 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.18%)
+ 820,783,449 stalled-cycles-backend:u # 28.71% backend cycles idle (75.18%)
+ 6,719,051,225 instructions:u # 2.35 insn per cycle
+ # 0.12 stalled cycles per insn (75.18%)
+ 0.937940806 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5412) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133158486847037
-Relative difference = 1.0706402269051248e-07
+Avg ME (F77/C++) = 1.4133162680784324
+Relative difference = 1.896804623606238e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.749260e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.809837e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.809837e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.957384 sec
- 2,703,855,487 cycles # 2.811 GHz
- 6,222,502,757 instructions # 2.30 insn per cycle
- 0.973369928 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133158486847037
-Relative difference = 1.0706402269051248e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.460445e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.502576e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.502576e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.144543 sec
- 2,153,040,856 cycles # 1.874 GHz
- 3,642,238,621 instructions # 1.69 insn per cycle
- 1.160831108 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413316e+00
-Avg ME (F77/C++) = 1.4133164031689205
-Relative difference = 2.852645271622733e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index 23e82f8a02..e4a164fc84 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-02-02_16:37:45
+DATE: 2024-02-08_18:23:25
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.436339e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.034377e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.051148e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.470990 sec
- 1,973,912,828 cycles # 2.872 GHz
- 2,831,326,050 instructions # 1.43 insn per cycle
- 0.766129633 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.900040e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.044051e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.053092e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2
+TOTAL : 0.422344 sec
+ 1,018,676,895 cycles:u # 2.291 GHz (73.42%)
+ 2,552,424 stalled-cycles-frontend:u # 0.25% frontend cycles idle (71.38%)
+ 5,008,265 stalled-cycles-backend:u # 0.49% backend cycles idle (73.75%)
+ 1,523,093,464 instructions:u # 1.50 insn per cycle
+ # 0.00 stalled cycles per insn (75.12%)
+ 0.469511377 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.036307e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.309762e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.326340e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.615492 sec
- 2,520,484,611 cycles # 2.906 GHz
- 3,696,111,471 instructions # 1.47 insn per cycle
- 0.928115911 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.603353e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.819036e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.824195e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2
+TOTAL : 0.652036 sec
+ 1,596,517,317 cycles:u # 2.357 GHz (74.50%)
+ 2,300,346 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.84%)
+ 5,526,422 stalled-cycles-backend:u # 0.35% backend cycles idle (75.02%)
+ 1,967,893,783 instructions:u # 1.23 insn per cycle
+ # 0.00 stalled cycles per insn (75.22%)
+ 0.711593351 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 1.413122e+00
-Avg ME (F77/CUDA) = 1.4131213755569487
-Relative difference = 4.418889885423659e-07
+Avg ME (F77/CUDA) = 1.4131213755569483
+Relative difference = 4.4188898885662695e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.471386e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.483203e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.483203e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.652751 sec
- 19,947,848,815 cycles # 2.997 GHz
- 59,158,461,511 instructions # 2.97 insn per cycle
- 6.660202804 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.590946e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.601468e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.601468e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 6.349934 sec
+ 20,094,310,453 cycles:u # 3.152 GHz (74.97%)
+ 2,850,071 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%)
+ 3,928,007,749 stalled-cycles-backend:u # 19.55% backend cycles idle (75.03%)
+ 59,145,863,191 instructions:u # 2.94 insn per cycle
+ # 0.07 stalled cycles per insn (75.03%)
+ 6.399687674 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213859069593
Relative difference = 4.345647726386255e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.765450e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.812202e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.812202e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.462081 sec
- 10,109,564,451 cycles # 2.917 GHz
- 29,765,770,491 instructions # 2.94 insn per cycle
- 3.475134206 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.497517e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.543872e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.543872e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 3.005856 sec
+ 9,511,244,318 cycles:u # 3.140 GHz (74.94%)
+ 2,365,507 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%)
+ 2,387,042,931 stalled-cycles-backend:u # 25.10% backend cycles idle (74.92%)
+ 29,810,359,607 instructions:u # 3.13 insn per cycle
+ # 0.08 stalled cycles per insn (74.84%)
+ 3.032697298 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213792564823
Relative difference = 4.392710025734405e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.473889e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.644743e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.644743e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.752560 sec
- 4,875,111,026 cycles # 2.775 GHz
- 11,201,068,655 instructions # 2.30 insn per cycle
- 1.776029314 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.132469e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.152112e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.152112e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 1.472531 sec
+ 4,723,246,589 cycles:u # 3.157 GHz (74.95%)
+ 2,125,901 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.87%)
+ 1,578,292,094 stalled-cycles-backend:u # 33.42% backend cycles idle (74.87%)
+ 11,191,081,410 instructions:u # 2.37 insn per cycle
+ # 0.14 stalled cycles per insn (74.87%)
+ 1.499599512 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4563) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213600217192
Relative difference = 4.5288254008796884e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.107264e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.130449e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.130449e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.503097 sec
- 4,226,957,714 cycles # 2.804 GHz
- 10,145,643,692 instructions # 2.40 insn per cycle
- 1.515377925 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213600217192
-Relative difference = 4.5288254008796884e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.622284e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.731515e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.731515e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.174918 sec
- 3,998,997,415 cycles # 1.835 GHz
- 5,838,720,700 instructions # 1.46 insn per cycle
- 2.186383197 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213600217192
-Relative difference = 4.5288254008796884e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
index 22c798e81e..bbf72bf4d2 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-DATE: 2024-02-02_16:38:15
+DATE: 2024-02-08_18:23:52
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.417705e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.038162e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.054371e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.468594 sec
- 1,994,687,709 cycles # 2.918 GHz
- 2,872,514,636 instructions # 1.44 insn per cycle
- 0.755044501 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.881819e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.030373e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.036991e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2
+TOTAL : 0.418757 sec
+ 1,057,766,788 cycles:u # 2.417 GHz (73.84%)
+ 2,187,648 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.59%)
+ 5,024,903 stalled-cycles-backend:u # 0.48% backend cycles idle (75.85%)
+ 1,531,993,399 instructions:u # 1.45 insn per cycle
+ # 0.00 stalled cycles per insn (76.30%)
+ 0.460795481 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.034176e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.306216e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.322752e+07 ) sec^-1
-MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.607883 sec
- 2,449,459,716 cycles # 2.875 GHz
- 3,629,898,800 instructions # 1.48 insn per cycle
- 0.911271074 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.593384e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.833698e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.838917e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2
+TOTAL : 0.643290 sec
+ 1,581,021,170 cycles:u # 2.382 GHz (74.39%)
+ 2,302,325 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.48%)
+ 5,629,796 stalled-cycles-backend:u # 0.36% backend cycles idle (74.89%)
+ 1,950,257,352 instructions:u # 1.23 insn per cycle
+ # 0.00 stalled cycles per insn (75.96%)
+ 0.690981216 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 1.413122e+00
-Avg ME (F77/CUDA) = 1.4131213755569487
-Relative difference = 4.418889885423659e-07
+Avg ME (F77/CUDA) = 1.4131213755569483
+Relative difference = 4.4188898885662695e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.495235e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.507294e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.507294e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.589324 sec
- 19,700,296,433 cycles # 2.988 GHz
- 58,707,136,540 instructions # 2.98 insn per cycle
- 6.596552489 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.585955e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.596594e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.596594e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 6.362147 sec
+ 20,187,670,852 cycles:u # 3.162 GHz (74.94%)
+ 2,859,211 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%)
+ 3,681,425,458 stalled-cycles-backend:u # 18.24% backend cycles idle (74.95%)
+ 58,740,442,273 instructions:u # 2.91 insn per cycle
+ # 0.06 stalled cycles per insn (75.01%)
+ 6.387668065 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213859069593
Relative difference = 4.345647726386255e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.820319e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.867843e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.867843e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.426180 sec
- 10,121,028,388 cycles # 2.952 GHz
- 30,159,143,099 instructions # 2.98 insn per cycle
- 3.439813193 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.563902e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.612203e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.612203e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 2.970623 sec
+ 9,394,603,985 cycles:u # 3.137 GHz (74.89%)
+ 2,356,159 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.89%)
+ 2,113,017,475 stalled-cycles-backend:u # 22.49% backend cycles idle (74.91%)
+ 30,189,438,679 instructions:u # 3.21 insn per cycle
+ # 0.07 stalled cycles per insn (75.03%)
+ 2.998060317 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213792564823
Relative difference = 4.392710025734405e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.352248e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.522746e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.522746e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.775217 sec
- 5,038,820,824 cycles # 2.831 GHz
- 11,663,824,812 instructions # 2.31 insn per cycle
- 1.791617120 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.097936e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.116705e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.116705e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2
+TOTAL : 1.517904 sec
+ 4,826,653,513 cycles:u # 3.131 GHz (74.84%)
+ 2,480,491 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.03%)
+ 1,558,060,205 stalled-cycles-backend:u # 32.28% backend cycles idle (75.09%)
+ 11,661,614,013 instructions:u # 2.42 insn per cycle
+ # 0.13 stalled cycles per insn (75.09%)
+ 1.545034243 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4667) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 1.413122e+00
Avg ME (F77/C++) = 1.4131213600217192
Relative difference = 4.5288254008796884e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.031398e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.052453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052453e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.612533 sec - 4,551,135,269 cycles # 2.815 GHz - 10,787,173,737 instructions # 2.37 insn per cycle - 1.628538481 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.644088e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.753996e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.753996e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.167907 sec - 4,052,527,826 cycles # 1.866 GHz - 6,072,984,180 instructions # 1.50 insn per cycle - 2.184116716 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.413122e+00
-Avg ME (F77/C++) = 1.4131213600217192
-Relative difference = 4.5288254008796884e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
index 7547cf19b3..51fae84f72 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_16:38:45
+DATE: 2024-02-08_18:24:18
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.454995e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.488376e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.491531e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.530977 sec
- 2,245,521,189 cycles # 2.936 GHz
- 3,409,805,805 instructions # 1.52 insn per cycle
- 0.835043958 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.419893e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.570341e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.571892e+04 ) sec^-1
+MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4
+TOTAL : 0.703629 sec
+ 1,839,027,261 cycles:u # 2.685 GHz (73.35%)
+ 2,323,224 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.61%)
+ 6,233,225 stalled-cycles-backend:u # 0.34% backend cycles idle (75.46%)
+ 2,082,581,761 instructions:u # 1.13 insn per cycle
+ # 0.00 stalled cycles per insn (75.55%)
+ 0.749197644 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.118576e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.159326e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.161073e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.048274 sec
- 9,868,317,269 cycles # 2.975 GHz
- 20,508,510,958 instructions # 2.08 insn per cycle
- 3.376673967 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.246279e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.248702e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.248758e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4
+TOTAL : 7.717029 sec
+ 23,764,997,698 cycles:u # 3.068 GHz (74.98%)
+ 3,331,784 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%)
+ 5,721,672 stalled-cycles-backend:u # 0.02% backend cycles idle (74.99%)
+ 19,028,265,618 instructions:u # 0.80 insn per cycle
+ # 0.00 stalled cycles per insn (75.04%)
+ 7.771005889 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.841494e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.842357e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.842357e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.917217 sec
- 26,450,937,968 cycles # 2.968 GHz
- 81,756,801,667 instructions # 3.09 insn per cycle
- 8.924534910 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.975465e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.976259e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.976259e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 8.310636 sec
+ 26,068,173,975 cycles:u # 3.128 GHz (74.95%)
+ 20,587,445 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.96%)
+ 3,852,440,225 stalled-cycles-backend:u # 14.78% backend cycles idle (74.98%)
+ 81,789,228,591 instructions:u # 3.14 insn per cycle
+ # 0.05 stalled cycles per insn (75.02%)
+ 8.336579974 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141133E-004
Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.749820e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.753409e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.753409e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.384881 sec
- 12,883,920,388 cycles # 2.936 GHz
- 39,241,666,790 instructions # 3.05 insn per cycle
- 4.400649487 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.509483e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.513670e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.513670e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 3.645427 sec
+ 11,461,884,241 cycles:u # 3.124 GHz (74.95%)
+ 1,107,292 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.93%)
+ 1,663,035,588 stalled-cycles-backend:u # 14.51% backend cycles idle (74.93%)
+ 39,247,572,695 instructions:u # 3.42 insn per cycle
+ # 0.04 stalled cycles per insn (74.95%)
+ 3.672796225 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141122E-004
Relative difference = 2.837299079287849e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.414731e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.431885e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.431885e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.959257 sec
- 5,556,228,763 cycles # 2.829 GHz
- 13,789,278,576 instructions # 2.48 insn per cycle
- 1.970607505 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.079252e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.081598e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.081598e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 1.528018 sec
+ 4,829,699,409 cycles:u # 3.113 GHz (74.80%)
+ 707,883 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.75%)
+ 574,218,895 stalled-cycles-backend:u # 11.89% backend cycles idle (74.79%)
+ 13,835,747,077 instructions:u # 2.86 insn per cycle
+ # 0.04 stalled cycles per insn (75.01%)
+ 1.555123003 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157309E-004
Relative difference = 2.837296636563793e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.538344e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.560799e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.560799e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.729824 sec
- 4,898,369,424 cycles # 2.825 GHz
- 12,318,701,579 instructions # 2.51 insn per cycle
- 1.746195289 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.516966e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.531137e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.531137e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.193086 sec
- 4,057,739,155 cycles # 1.847 GHz
- 6,286,877,961 instructions # 1.55 insn per cycle
- 2.205149690 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
index b723053208..1e61565f06 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
@@ -1,240 +1,196 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_17:13:13
+DATE: 2024-02-08_19:11:09
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.142002e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.477843e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.477843e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.515959 sec
- 2,170,304,494 cycles # 2.917 GHz
- 3,359,236,719 instructions # 1.55 insn per cycle
- 0.806165102 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.384904e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.522670e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.522670e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 0.661073 sec
+ 1,963,722,469 cycles:u # 2.878 GHz (75.53%)
+ 2,436,720 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.96%)
+ 53,166,474 stalled-cycles-backend:u # 2.71% backend cycles idle (74.28%)
+ 2,231,069,227 instructions:u # 1.14 insn per cycle
+ # 0.02 stalled cycles per insn (74.82%)
+ 0.706607210 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.629799e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.107541e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.107541e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.314077 sec
- 10,532,397,523 cycles # 2.932 GHz
- 23,652,635,041 instructions # 2.25 insn per cycle
- 3.649331385 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.209932e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.244631e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.244631e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
+TOTAL : 8.541213 sec
+ 29,335,442,337 cycles:u # 3.417 GHz (74.94%)
+ 22,204,977 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.96%)
+ 1,148,495,427 stalled-cycles-backend:u # 3.92% backend cycles idle (75.02%)
+ 23,524,172,196 instructions:u # 0.80 insn per cycle
+ # 0.05 stalled cycles per insn (75.03%)
+ 8.611621716 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.877028e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.877950e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.877950e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.752534 sec
- 26,465,488,132 cycles # 3.023 GHz
- 81,758,555,274 instructions # 3.09 insn per cycle
- 8.757733786 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.228682e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.229582e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.229582e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 7.370176 sec
+ 25,875,705,823 cycles:u # 3.500 GHz (75.00%)
+ 3,536,166 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%)
+ 3,941,158,888 stalled-cycles-backend:u # 15.23% backend cycles idle (75.01%)
+ 81,723,638,272 instructions:u # 3.16 insn per cycle
+ # 0.05 stalled cycles per insn (75.00%)
+ 7.395046657 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141133E-004
Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.631623e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.634951e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.634951e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.530261 sec
- 12,919,849,016 cycles # 2.849 GHz
- 39,254,561,699 instructions # 3.04 insn per cycle
- 4.535411374 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.047525e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.052116e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.052116e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 3.260880 sec
+ 11,461,230,297 cycles:u # 3.490 GHz (74.92%)
+ 1,057,133 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%)
+ 1,664,799,119 stalled-cycles-backend:u # 14.53% backend cycles idle (74.82%)
+ 39,285,224,142 instructions:u # 3.43 insn per cycle
+ # 0.04 stalled cycles per insn (74.95%)
+ 3.287293927 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141122E-004
Relative difference = 2.837299079287849e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.374160e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.392029e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.392029e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.972509 sec
- 5,579,622,120 cycles # 2.823 GHz
- 13,798,934,184 instructions # 2.47 insn per cycle
- 1.977992313 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 1.206630e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.209265e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.209265e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 1.370722 sec
+ 4,816,403,203 cycles:u # 3.456 GHz (74.75%)
+ 897,807 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.79%)
+ 595,702,815 stalled-cycles-backend:u # 12.37% backend cycles idle (75.01%)
+ 13,825,146,605 instructions:u # 2.87 insn per cycle
+ # 0.04 stalled cycles per insn (75.26%)
+ 1.396941426 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.505420e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.528495e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.528495e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.739505 sec
- 4,911,934,991 cycles # 2.817 GHz
- 12,327,929,521 instructions # 2.51 insn per cycle
- 1.745018547 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157309E-004
Relative difference = 2.837296636563793e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.517409e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.532638e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.532638e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.195991 sec
- 4,070,014,153 cycles # 1.850 GHz
- 6,297,376,156 instructions # 1.55 insn per cycle
- 2.201248928 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
index b375875b9a..dc839449b6 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
@@ -1,223 +1,182 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_17:25:10
+DATE: 2024-02-08_19:21:20
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.475309e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.502643e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.505183e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.371043e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.569495e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.570900e+04 ) sec^-1
MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 0.507932 sec
- 2,176,251,343 cycles # 2.937 GHz
- 3,441,474,290 instructions # 1.58 insn per cycle
- 0.800974039 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 0.650484 sec
+ 1,944,232,578 cycles:u # 2.906 GHz (75.06%)
+ 2,566,490 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.53%)
+ 47,645,331 stalled-cycles-backend:u # 2.45% backend cycles idle (75.78%)
+ 2,151,229,336 instructions:u # 1.11 insn per cycle
+ # 0.02 stalled cycles per insn (74.71%)
+ 0.692677245 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.134917e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.169049e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.170478e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.245808e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.248801e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.248859e+05 ) sec^-1
MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
-TOTAL : 3.128176 sec
- 10,125,231,079 cycles # 2.996 GHz
- 22,243,211,904 instructions # 2.20 insn per cycle
- 3.439515240 seconds time elapsed
+TOTAL : 8.387158 sec
+ 28,895,428,422 cycles:u # 3.432 GHz (74.94%)
+ 11,926,413 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.05%)
+ 1,138,859,957 stalled-cycles-backend:u # 3.94% backend cycles idle (75.03%)
+ 22,682,042,659 instructions:u # 0.78 insn per cycle
+ # 0.05 stalled cycles per insn (75.02%)
+ 8.441980465 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.860528e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.861418e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.861418e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.226063e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.227012e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.227012e+03 ) sec^-1
MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 8.828976 sec
- 26,462,400,944 cycles # 2.997 GHz
- 81,755,008,473 instructions # 3.09 insn per cycle
- 8.834001681 seconds time elapsed
+TOTAL : 7.374847 sec
+ 25,911,004,356 cycles:u # 3.503 GHz (74.98%)
+ 4,934,775 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%)
+ 3,929,539,299 stalled-cycles-backend:u # 15.17% backend cycles idle (75.02%)
+ 81,751,179,129 instructions:u # 3.16 insn per cycle
+ # 0.05 stalled cycles per insn (75.02%)
+ 7.398660960 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141133E-004
Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.627498e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.630911e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.630911e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.043786e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.048334e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.048334e+03 ) sec^-1
MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 4.532568 sec
- 12,853,517,544 cycles # 2.834 GHz
- 39,241,007,221 instructions # 3.05 insn per cycle
- 4.537462439 seconds time elapsed
+TOTAL : 3.259433 sec
+ 11,462,316,668 cycles:u # 3.494 GHz (74.89%)
+ 1,101,286 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.89%)
+ 1,656,842,817 stalled-cycles-backend:u # 14.45% backend cycles idle (74.92%)
+ 39,291,812,915 instructions:u # 3.43 insn per cycle
+ # 0.04 stalled cycles per insn (75.05%)
+ 3.282743813 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141122E-004
Relative difference = 2.837299079287849e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.374898e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.391697e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.391697e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.207796e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.210426e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.210426e+04 ) sec^-1
MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.969618 sec
- 5,566,599,702 cycles # 2.821 GHz
- 13,787,372,347 instructions # 2.48 insn per cycle
- 1.974584508 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
+TOTAL : 1.366016 sec
+ 4,823,946,998 cycles:u # 3.476 GHz (74.68%)
+ 737,483 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.84%)
+ 604,858,936 stalled-cycles-backend:u # 12.54% backend cycles idle (75.08%)
+ 13,809,568,520 instructions:u # 2.86 insn per cycle
+ # 0.04 stalled cycles per insn (75.22%)
+ 1.389473172 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157309E-004
Relative difference = 2.837296636563793e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.549006e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.573076e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.573076e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.728976 sec
- 4,899,376,833 cycles # 2.828 GHz
- 12,315,465,343 instructions # 2.51 insn per cycle
- 1.733715944 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.480774e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.495806e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.495806e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 2.204808 sec
- 4,062,625,554 cycles # 1.840 GHz
- 6,283,495,821 instructions # 1.55 insn per cycle
- 2.209746157 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
index 760bb1f09a..6bafd9bab6 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
@@ -6,8 +6,9 @@ AVX=512y
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasCurand
+HASHIPRAND=hasNoHiprand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
@@ -41,7 +42,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_17:21:46
+DATE: 2024-02-05_22:06:43
On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
@@ -50,14 +51,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.482840e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.510946e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.513158e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.480734e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.510074e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.512231e+05 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.506050 sec
- 2,185,172,799 cycles # 2.927 GHz
- 3,335,781,295 instructions # 1.53 insn per cycle
- 0.810404737 seconds time elapsed
+TOTAL : 0.506367 sec
+ 2,239,917,145 cycles # 3.011 GHz
+ 3,475,727,316 instructions # 1.55 insn per cycle
+ 0.813688193 seconds time elapsed
runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +69,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.145792e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.180137e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.181577e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.146935e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.181383e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.182805e+05 ) sec^-1
MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.062771 sec
- 9,845,230,376 cycles # 2.967 GHz
- 21,536,417,739 instructions # 2.19 insn per cycle
- 3.374155567 seconds time elapsed
+TOTAL : 3.063188 sec
+ 10,208,493,450 cycles # 3.077 GHz
+ 23,347,053,488 instructions # 2.29 insn per cycle
+ 3.373683802 seconds time elapsed
-------------------------------------------------------------------------
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +92,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.866708e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.867580e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.867580e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.927773e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.928672e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.928672e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.795903 sec
- 26,433,622,271 cycles # 3.004 GHz
- 81,758,988,249 instructions # 3.09 insn per cycle
- 8.800798013 seconds time elapsed
+TOTAL : 8.518263 sec
+ 26,413,942,812 cycles # 3.100 GHz
+ 81,752,027,211 instructions # 3.10 insn per cycle
+ 8.523020083 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +119,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.748003e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.751493e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.751493e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.769339e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.773163e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.773163e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.386339 sec
- 12,904,123,788 cycles # 2.940 GHz
- 39,240,718,951 instructions # 3.04 insn per cycle
- 4.391268199 seconds time elapsed
+TOTAL : 4.362035 sec
+ 12,908,972,832 cycles # 2.957 GHz
+ 39,242,218,426 instructions # 3.04 insn per cycle
+ 4.367705377 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -145,14 +146,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.415832e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.434540e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.434540e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.597196e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.615390e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.615390e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.959015 sec
- 5,558,864,478 cycles # 2.834 GHz
- 13,788,301,741 instructions # 2.48 insn per cycle
- 1.963927728 seconds time elapsed
+TOTAL : 1.917124 sec
+ 5,555,350,612 cycles # 2.892 GHz
+ 13,788,371,660 instructions # 2.48 insn per cycle
+ 1.921925818 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -172,14 +173,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.494161e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.517000e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.517000e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.768207e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.792227e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.792227e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.737371 sec
- 4,896,629,967 cycles # 2.812 GHz
- 12,317,684,315 instructions # 2.52 insn per cycle
- 1.742251355 seconds time elapsed
+TOTAL : 1.688918 sec
+ 4,898,535,094 cycles # 2.894 GHz
+ 12,317,895,390 instructions # 2.51 insn per cycle
+ 1.693783698 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
@@ -199,14 +200,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.544986e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.559315e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.559315e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.389097e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.403636e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.403636e+03 ) sec^-1
MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.183770 sec
- 4,054,048,032 cycles # 1.853 GHz
- 6,285,163,070 instructions # 1.55 insn per cycle
- 2.188518130 seconds time elapsed
+TOTAL : 2.229769 sec
+ 4,080,389,432 cycles # 1.828 GHz
+ 6,285,644,175 instructions # 1.54 insn per cycle
+ 2.234633734 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
-------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
index fcc9ac3ce2..2f51666cca 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,226 +1,187 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_17:18:26
+DATE: 2024-02-08_19:17:37
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.222879e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.536597e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.538907e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.510219 sec
- 2,180,036,664 cycles # 2.934 GHz
- 3,449,779,265 instructions # 1.58 insn per cycle
- 0.804483156 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.457379e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.590055e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.591542e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 0.647675 sec
+ 1,950,784,003 cycles:u # 2.898 GHz (74.98%)
+ 2,955,540 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.15%)
+ 38,642,817 stalled-cycles-backend:u # 1.98% backend cycles idle (75.15%)
+ 2,160,763,754 instructions:u # 1.11 insn per cycle
+ # 0.02 stalled cycles per insn (74.99%)
+ 0.689968370 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.733472e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.173653e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.175138e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.200793 sec
- 10,300,102,656 cycles # 2.982 GHz
- 21,726,579,468 instructions # 2.11 insn per cycle
- 3.512221005 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.216026e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.248213e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.248271e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
+TOTAL : 8.491761 sec
+ 29,220,773,291 cycles:u # 3.425 GHz (74.96%)
+ 23,526,526 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.97%)
+ 1,139,934,734 stalled-cycles-backend:u # 3.90% backend cycles idle (75.01%)
+ 23,436,671,159 instructions:u # 0.80 insn per cycle
+ # 0.05 stalled cycles per insn (75.06%)
+ 8.550830835 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.843930e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.844780e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.844780e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.904140 sec
- 26,441,840,151 cycles # 2.969 GHz
- 81,752,619,472 instructions # 3.09 insn per cycle
- 8.909139515 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.228857e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.229766e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.229766e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 7.365932 sec
+ 25,860,325,327 cycles:u # 3.500 GHz (74.99%)
+ 3,417,700 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%)
+ 3,793,396,241 stalled-cycles-backend:u # 14.67% backend cycles idle (74.99%)
+ 81,737,950,688 instructions:u # 3.16 insn per cycle
+ # 0.05 stalled cycles per insn (74.99%)
+ 7.390284378 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141133E-004
Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.748131e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.751701e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.751701e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.386057 sec
- 12,901,224,827 cycles # 2.940 GHz
- 39,241,205,086 instructions # 3.04 insn per cycle
- 4.390920075 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.983789e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.988177e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.988177e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 3.298665 sec
+ 11,598,734,474 cycles:u # 3.493 GHz (74.95%)
+ 2,213,945 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%)
+ 1,816,599,463 stalled-cycles-backend:u # 15.66% backend cycles idle (74.95%)
+ 39,318,197,888 instructions:u # 3.39 insn per cycle
+ # 0.05 stalled cycles per insn (74.88%)
+ 3.323078386 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141122E-004
Relative difference = 2.837299079287849e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.414540e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.432173e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.432173e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.959579 sec
- 5,556,358,156 cycles # 2.830 GHz
- 13,788,808,039 instructions # 2.48 insn per cycle
- 1.964982375 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.206033e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.208640e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.208640e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 1.367797 sec
+ 4,826,714,733 cycles:u # 3.473 GHz (74.68%)
+ 807,379 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.73%)
+ 618,304,947 stalled-cycles-backend:u # 12.81% backend cycles idle (75.01%)
+ 13,810,963,654 instructions:u # 2.86 insn per cycle
+ # 0.04 stalled cycles per insn (75.26%)
+ 1.391784798 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157309E-004
Relative difference = 2.837296636563793e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.562763e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.585503e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.585503e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.724810 sec
- 4,896,110,262 cycles # 2.832 GHz
- 12,317,522,283 instructions # 2.52 insn per cycle
- 1.729904661 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.537759e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.552637e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.552637e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.186012 sec
- 4,052,613,508 cycles # 1.851 GHz
- 6,285,345,754 instructions # 1.55 insn per cycle
- 2.191305338 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
index 12232058d0..5f9cd740db 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_16:39:22
+DATE: 2024-02-08_18:24:57
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.463480e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.496732e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.499211e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.526100 sec
- 2,265,313,349 cycles # 2.942 GHz
- 3,486,028,495 instructions # 1.54 insn per cycle
- 0.840774322 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.405379e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.460398e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.460958e+05 ) sec^-1
+MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4
+TOTAL : 0.528781 sec
+ 1,385,687,149 cycles:u # 2.539 GHz (74.99%)
+ 2,277,657 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.18%)
+ 5,647,445 stalled-cycles-backend:u # 0.41% backend cycles idle (75.23%)
+ 1,723,056,293 instructions:u # 1.24 insn per cycle
+ # 0.00 stalled cycles per insn (75.27%)
+ 0.573805843 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.123478e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.164078e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.165823e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.035437 sec
- 9,876,464,611 cycles # 2.996 GHz
- 19,678,675,992 instructions # 1.99 insn per cycle
- 3.354407522 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.739001e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.743812e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.743938e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4
+TOTAL : 6.363734 sec
+ 19,704,908,340 cycles:u # 3.083 GHz (74.87%)
+ 2,953,635 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%)
+ 5,788,233 stalled-cycles-backend:u # 0.03% backend cycles idle (74.97%)
+ 15,876,708,731 instructions:u # 0.81 insn per cycle
+ # 0.00 stalled cycles per insn (74.99%)
+ 6.416629497 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.853977e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.854832e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.854832e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.859352 sec
- 26,471,680,418 cycles # 2.990 GHz
- 81,783,434,666 instructions # 3.09 insn per cycle
- 8.866882850 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.987007e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.987813e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.987813e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 8.261995 sec
+ 26,146,192,849 cycles:u # 3.156 GHz (75.00%)
+ 19,881,039 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.99%)
+ 3,292,723,902 stalled-cycles-backend:u # 12.59% backend cycles idle (74.99%)
+ 81,718,806,984 instructions:u # 3.13 insn per cycle
+ # 0.04 stalled cycles per insn (74.99%)
+ 8.288021078 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141133E-004
Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.729651e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.733222e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.733222e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.408104 sec
- 12,919,398,917 cycles # 2.928 GHz
- 39,248,479,875 instructions # 3.04 insn per cycle
- 4.422279604 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.520526e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.524263e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.524263e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 3.635288 sec
+ 11,497,017,579 cycles:u # 3.143 GHz (74.94%)
+ 1,199,037 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%)
+ 1,535,871,724 stalled-cycles-backend:u # 13.36% backend cycles idle (75.07%)
+ 39,259,745,497 instructions:u # 3.41 insn per cycle
+ # 0.04 stalled cycles per insn (75.07%)
+ 3.661377334 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141122E-004
Relative difference = 2.837299079287849e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.377146e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.394509e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.394509e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.968240 sec
- 5,552,838,131 cycles # 2.815 GHz
- 13,804,885,404 instructions # 2.49 insn per cycle
- 1.985050205 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.075641e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.077978e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.077978e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 1.532948 sec
+ 4,831,457,512 cycles:u # 3.104 GHz (74.88%)
+ 662,593 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.82%)
+ 567,155,151 stalled-cycles-backend:u # 11.74% backend cycles idle (74.82%)
+ 13,833,428,313 instructions:u # 2.86 insn per cycle
+ # 0.04 stalled cycles per insn (74.86%)
+ 1.560029589 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11030) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157309E-004
Relative difference = 2.837296636563793e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.616548e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.640239e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.640239e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.715037 sec
- 4,882,460,771 cycles # 2.839 GHz
- 12,329,458,000 instructions # 2.53 insn per cycle
- 1.726544499 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.578273e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.592070e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.592070e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.175187 sec
- 4,048,706,273 cycles # 1.858 GHz
- 6,292,651,416 instructions # 1.55 insn per cycle
- 2.189285599 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
index a196b44ea8..8a7a242de3 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_17:02:04
+DATE: 2024-02-08_18:51:59
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.222290e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.247528e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.250254e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.534425 sec
- 2,240,431,596 cycles # 2.919 GHz
- 3,496,667,774 instructions # 1.56 insn per cycle
- 0.826707626 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.423206e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.593883e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.594749e+04 ) sec^-1
+MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4
+TOTAL : 0.637916 sec
+ 1,865,547,239 cycles:u # 2.851 GHz (74.19%)
+ 2,330,064 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.04%)
+ 5,743,309 stalled-cycles-backend:u # 0.31% backend cycles idle (75.53%)
+ 2,088,439,685 instructions:u # 1.12 insn per cycle
+ # 0.00 stalled cycles per insn (76.09%)
+ 0.684161271 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.763970e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.792510e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.793695e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.308019 sec
- 10,639,344,983 cycles # 2.988 GHz
- 23,949,660,196 instructions # 2.25 insn per cycle
- 3.620397406 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.242800e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.245327e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.245392e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4
+TOTAL : 7.696446 sec
+ 25,980,025,319 cycles:u # 3.365 GHz (75.07%)
+ 3,610,890 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.06%)
+ 6,327,050 stalled-cycles-backend:u # 0.02% backend cycles idle (75.03%)
+ 20,725,161,616 instructions:u # 0.80 insn per cycle
+ # 0.00 stalled cycles per insn (75.05%)
+ 7.747812070 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158122E-004
-Relative difference = 2.837296513854949e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.364131e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.364607e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.364607e+02 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 37.593227 sec
- 113,059,409,327 cycles # 3.008 GHz
- 141,522,513,699 instructions # 1.25 insn per cycle
- 37.598042584 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.523005e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.523398e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.523398e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 36.267689 sec
+ 126,724,659,584 cycles:u # 3.492 GHz (75.00%)
+ 103,201,545 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.00%)
+ 16,889,689,256 stalled-cycles-backend:u # 13.33% backend cycles idle (75.00%)
+ 141,470,243,970 instructions:u # 1.12 insn per cycle
+ # 0.12 stalled cycles per insn (75.00%)
+ 36.292327891 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:21543) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198140461E-004
Relative difference = 2.8372991790910424e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.165748e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.168296e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.168296e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 5.190386 sec
- 14,938,107,907 cycles # 2.876 GHz
- 37,533,627,548 instructions # 2.51 insn per cycle
- 5.195435855 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.659008e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.661448e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.661448e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 4.490054 sec
+ 15,779,015,944 cycles:u # 3.497 GHz (75.00%)
+ 2,135,494 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%)
+ 7,458,486,693 stalled-cycles-backend:u # 47.27% backend cycles idle (75.00%)
+ 37,532,441,528 instructions:u # 2.38 insn per cycle
+ # 0.20 stalled cycles per insn (75.00%)
+ 4.515783744 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141220E-004
Relative difference = 2.837299064562788e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.601505e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.615927e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.615927e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.167544 sec
- 6,037,441,239 cycles # 2.780 GHz
- 12,947,499,501 instructions # 2.14 insn per cycle
- 2.172600421 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.591587e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.601950e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.601950e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 2.168280 sec
+ 7,642,041,008 cycles:u # 3.488 GHz (74.70%)
+ 1,070,869 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%)
+ 4,390,924,971 stalled-cycles-backend:u # 57.46% backend cycles idle (75.17%)
+ 12,967,898,753 instructions:u # 1.70 insn per cycle
+ # 0.34 stalled cycles per insn (75.18%)
+ 2.194145921 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46575) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198156778E-004
Relative difference = 2.837296716733571e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.341482e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.363063e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.363063e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.765698 sec
- 4,994,170,946 cycles # 2.822 GHz
- 11,364,035,735 instructions # 2.28 insn per cycle
- 1.770642053 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156778E-004
-Relative difference = 2.837296716733571e-07
-OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.768561e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.783807e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.783807e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.121383 sec
- 3,898,623,942 cycles # 1.834 GHz
- 5,853,939,217 instructions # 1.50 insn per cycle
- 2.126336750 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe
-[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156789E-004
-Relative difference = 2.837296715097453e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
index 71aae0e2ac..cef84849ff 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_17:03:13
+DATE: 2024-02-08_18:53:09
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.242331e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.266988e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.269106e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.532032 sec
- 2,253,251,540 cycles # 2.936 GHz
- 3,479,836,083 instructions # 1.54 insn per cycle
- 0.824975830 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.396830e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.450949e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.451460e+05 ) sec^-1
+MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4
+TOTAL : 0.521706 sec
+ 1,481,318,938 cycles:u # 2.748 GHz (74.81%)
+ 2,169,944 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.29%)
+ 5,423,573 stalled-cycles-backend:u # 0.37% backend cycles idle (76.12%)
+ 1,790,069,205 instructions:u # 1.21 insn per cycle
+ # 0.00 stalled cycles per insn (76.17%)
+ 0.566631028 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.794982e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.824044e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.825260e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.277375 sec
- 10,526,717,320 cycles # 2.981 GHz
- 21,686,213,398 instructions # 2.06 insn per cycle
- 3.590863885 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.738741e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.743652e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.743775e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4
+TOTAL : 6.347524 sec
+ 21,797,324,711 cycles:u # 3.416 GHz (74.97%)
+ 3,038,624 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%)
+ 4,838,637 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%)
+ 17,450,782,853 instructions:u # 0.80 insn per cycle
+ # 0.00 stalled cycles per insn (75.10%)
+ 6.406516717 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158122E-004
-Relative difference = 2.837296513854949e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.323417e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.323914e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.323914e+02 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 37.948391 sec
- 114,134,067,378 cycles # 3.008 GHz
- 141,699,321,617 instructions # 1.24 insn per cycle
- 37.953563744 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.563502e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.563872e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.563872e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 35.944992 sec
+ 126,053,509,205 cycles:u # 3.505 GHz (75.00%)
+ 11,395,374 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%)
+ 18,435,255,585 stalled-cycles-backend:u # 14.62% backend cycles idle (75.00%)
+ 141,645,144,485 instructions:u # 1.12 insn per cycle
+ # 0.13 stalled cycles per insn (75.00%)
+ 35.969683395 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:21831) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198140461E-004
Relative difference = 2.8372991790910424e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.218340e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.220966e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.220966e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 5.105573 sec
- 14,891,133,850 cycles # 2.914 GHz
- 37,592,704,265 instructions # 2.52 insn per cycle
- 5.111064391 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.631101e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.633522e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.633522e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 4.524338 sec
+ 15,911,871,415 cycles:u # 3.500 GHz (74.97%)
+ 5,229,595 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.02%)
+ 7,656,219,059 stalled-cycles-backend:u # 48.12% backend cycles idle (75.02%)
+ 37,566,202,173 instructions:u # 2.36 insn per cycle
+ # 0.20 stalled cycles per insn (75.02%)
+ 4.550369242 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141220E-004
Relative difference = 2.837299064562788e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.875299e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.890872e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.890872e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.092232 sec
- 5,936,199,506 cycles # 2.832 GHz
- 12,831,019,263 instructions # 2.16 insn per cycle
- 2.097300219 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.703077e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.713803e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.713803e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 2.136816 sec
+ 7,529,467,375 cycles:u # 3.487 GHz (74.70%)
+ 1,210,559 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.88%)
+ 4,270,335,713 stalled-cycles-backend:u # 56.71% backend cycles idle (75.18%)
+ 12,852,865,982 instructions:u # 1.71 insn per cycle
+ # 0.33 stalled cycles per insn (75.18%)
+ 2.162649621 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45645) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198156778E-004
Relative difference = 2.837296716733571e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.330408e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.351865e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.351865e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.767511 sec
- 4,998,448,739 cycles # 2.822 GHz
- 11,359,989,955 instructions # 2.27 insn per cycle
- 1.772526997 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198156778E-004
-Relative difference = 2.837296716733571e-07
-OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.848173e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.863809e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.863809e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.099900 sec
- 3,891,483,141 cycles # 1.850 GHz
- 5,843,956,057 instructions # 1.50 insn per cycle
- 2.104787726 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 206c292560..db75b7c23c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,223 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_16:40:00
+DATE: 2024-02-08_18:25:33
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.317917e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.379313e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.386616e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.490185 sec
- 2,016,313,308 cycles # 2.850 GHz
- 2,918,365,205 instructions # 1.45 insn per cycle
- 0.793343693 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.603654e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.780655e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.781379e+05 ) sec^-1
+MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4
+TOTAL : 0.440792 sec
+ 1,154,541,552 cycles:u # 2.519 GHz (74.24%)
+ 2,198,707 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.83%)
+ 4,451,469 stalled-cycles-backend:u # 0.39% backend cycles idle (75.14%)
+ 1,580,717,351 instructions:u # 1.37 insn per cycle
+ # 0.00 stalled cycles per insn (75.33%)
+ 0.489512861 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.543056e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.632494e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.636263e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.727744 sec
- 5,864,921,285 cycles # 2.984 GHz
- 11,778,131,765 instructions # 2.01 insn per cycle
- 2.022340436 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.700797e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.727360e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.727823e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4
+TOTAL : 2.641675 sec
+ 7,997,770,380 cycles:u # 3.004 GHz (74.83%)
+ 2,555,358 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.88%)
+ 4,924,419 stalled-cycles-backend:u # 0.06% backend cycles idle (75.08%)
+ 6,831,619,214 instructions:u # 0.85 insn per cycle
+ # 0.00 stalled cycles per insn (75.05%)
+ 2.687486810 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626791e-04
+Avg ME (F77/CUDA) = 6.6270899361878938E-004
+Relative difference = 4.511024836808726e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.036500e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.037538e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.037538e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.063001 sec
- 24,206,017,725 cycles # 3.001 GHz
- 75,876,966,036 instructions # 3.13 insn per cycle
- 8.070029497 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.258730e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.259735e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.259735e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 7.267483 sec
+ 23,372,915,591 cycles:u # 3.206 GHz (74.98%)
+ 1,342,333 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%)
+ 2,729,199,994 stalled-cycles-backend:u # 11.68% backend cycles idle (74.98%)
+ 75,832,248,218 instructions:u # 3.24 insn per cycle
+ # 0.04 stalled cycles per insn (74.98%)
+ 7.293005862 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274870439686495E-004
-Relative difference = 6.634286759220428e-09
+Avg ME (F77/C++) = 6.6274866115424713E-004
+Relative difference = 5.861309557415831e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.462042e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.476020e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.476020e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.206445 sec
- 6,488,895,466 cycles # 2.935 GHz
- 20,115,222,341 instructions # 3.10 insn per cycle
- 2.217555356 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.243925e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.254539e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.254539e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 2.271192 sec
+ 6,771,611,711 cycles:u # 2.948 GHz (74.97%)
+ 1,269,186 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%)
+ 1,159,889,931 stalled-cycles-backend:u # 17.13% backend cycles idle (74.93%)
+ 20,147,624,321 instructions:u # 2.98 insn per cycle
+ # 0.06 stalled cycles per insn (74.96%)
+ 2.305102688 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274853360924479E-004
-Relative difference = 5.071191384964548e-08
+Avg ME (F77/C++) = 6.6274845946848876E-004
+Relative difference = 6.115670001294808e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.669374e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.676510e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.676510e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.991572 sec
- 2,820,891,180 cycles # 2.832 GHz
- 7,038,348,899 instructions # 2.50 insn per cycle
- 1.003372796 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.074907e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.083888e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.083888e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4
+TOTAL : 0.797255 sec
+ 2,486,142,212 cycles:u # 3.028 GHz (74.67%)
+ 651,261 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.67%)
+ 237,286,614 stalled-cycles-backend:u # 9.54% backend cycles idle (74.74%)
+ 7,086,981,766 instructions:u # 2.85 insn per cycle
+ # 0.03 stalled cycles per insn (74.92%)
+ 0.824650441 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.900266e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.908892e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.908892e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.872018 sec
- 2,479,495,985 cycles # 2.829 GHz
- 6,280,559,463 instructions # 2.53 insn per cycle
- 0.883776981 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271947045332125E-004
+Relative difference = 4.4583988847766445e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.513458e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.519205e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.519205e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.092713 sec
- 2,036,976,484 cycles # 1.857 GHz
- 3,248,646,655 instructions # 1.59 insn per cycle
- 1.104780481 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952818273971E-004
-Relative difference = 4.252589469696448e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
index 51ad5a831f..fffca73371 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
@@ -1,240 +1,196 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_17:13:51
+DATE: 2024-02-08_19:11:45
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
 WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.631214e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.334260e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.334260e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.468416 sec
- 2,030,077,074 cycles # 2.931 GHz
- 2,985,155,777 instructions # 1.47 insn per cycle
- 0.750551422 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.585947e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.760142e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.760142e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4
+TOTAL : 0.444317 sec
+ 1,232,861,961 cycles:u # 2.651 GHz (74.23%)
+ 3,052,215 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.82%)
+ 50,695,897 stalled-cycles-backend:u # 4.11% backend cycles idle (76.49%)
+ 1,628,289,331 instructions:u # 1.32 insn per cycle
+ # 0.03 stalled cycles per insn (76.75%)
+ 0.490521428 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.250631e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.489671e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.489671e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.898956 sec
- 6,377,372,257 cycles # 2.987 GHz
- 13,506,737,979 instructions # 2.12 insn per cycle
- 2.194643390 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.269703e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.713043e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.713043e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4
+TOTAL : 3.440256 sec
+ 11,490,476,382 cycles:u # 3.308 GHz (74.90%)
+ 38,895,273 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.98%)
+ 1,129,565,212 stalled-cycles-backend:u # 9.83% backend cycles idle (75.15%)
+ 9,919,336,188 instructions:u # 0.86 insn per cycle
+ # 0.11 stalled cycles per insn (75.16%)
+ 3.499527101 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626791e-04
+Avg ME (F77/CUDA) = 6.6270899361878938E-004
+Relative difference = 4.511024836808726e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.042608e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.043634e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.043634e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.040623 sec
- 24,222,293,839 cycles # 3.011 GHz
- 75,880,608,860 instructions # 3.13 insn per cycle
- 8.045752213 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.472987e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.474035e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.474035e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 6.639693 sec
+ 23,332,846,989 cycles:u # 3.502 GHz (74.98%)
+ 1,288,164 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%)
+ 2,737,407,629 stalled-cycles-backend:u # 11.73% backend cycles idle (75.02%)
+ 75,874,614,222 instructions:u # 3.25 insn per cycle
+ # 0.04 stalled cycles per insn (75.03%)
+ 6.664462835 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274870439686495E-004
-Relative difference = 6.634286759220428e-09
+Avg ME (F77/C++) = 6.6274866115424713E-004
+Relative difference = 5.861309557415831e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.360246e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.374729e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.374729e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.241786 sec
- 6,512,660,808 cycles # 2.902 GHz
- 20,124,093,324 instructions # 3.09 insn per cycle
- 2.246769039 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.951853e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.969495e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.969495e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 1.656113 sec
+ 5,836,586,779 cycles:u # 3.477 GHz (74.78%)
+ 717,450 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%)
+ 848,310,772 stalled-cycles-backend:u # 14.53% backend cycles idle (75.11%)
+ 20,140,599,240 instructions:u # 3.45 insn per cycle
+ # 0.04 stalled cycles per insn (75.23%)
+ 1.682172413 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274853360924479E-004
-Relative difference = 5.071191384964548e-08
+Avg ME (F77/C++) = 6.6274845946848876E-004
+Relative difference = 6.115670001294808e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.664861e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.672126e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.672126e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.996235 sec
- 2,826,684,180 cycles # 2.826 GHz
- 7,046,884,926 instructions # 2.49 insn per cycle
- 1.001186445 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.876461e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.885617e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.885617e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.885271 sec
- 2,497,914,751 cycles # 2.809 GHz
- 6,289,049,441 instructions # 2.52 insn per cycle
- 0.890202670 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.361989e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.372243e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.372243e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4
+TOTAL : 0.702439 sec
+ 2,486,852,526 cycles:u # 3.431 GHz (74.62%)
+ 558,544 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.62%)
+ 304,098,180 stalled-cycles-backend:u # 12.23% backend cycles idle (74.71%)
+ 7,106,951,943 instructions:u # 2.86 insn per cycle
+ # 0.04 stalled cycles per insn (75.22%)
+ 0.728850903 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271947045332125E-004
+Relative difference = 4.4583988847766445e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.522385e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.528310e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.528310e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.088825 sec
- 2,043,694,023 cycles # 1.870 GHz
- 3,257,570,377 instructions # 1.59 insn per cycle
- 1.093702296 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952818273971E-004
-Relative difference = 4.252589469696448e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
index 8cf77f7773..1a81f08f91 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
@@ -1,223 +1,182 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_17:25:48
+DATE: 2024-02-08_19:21:56
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.323117e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.374654e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.379926e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4
-TOTAL : 0.463884 sec
- 1,972,854,664 cycles # 2.934 GHz
- 2,970,579,118 instructions # 1.51 insn per cycle
- 0.731998195 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.514466e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.765340e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.766977e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4
+TOTAL : 0.439790 sec
+ 1,244,201,437 cycles:u # 2.707 GHz (74.39%)
+ 2,857,674 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.93%)
+ 33,629,601 stalled-cycles-backend:u # 2.70% backend cycles idle (74.64%)
+ 1,575,400,945 instructions:u # 1.27 insn per cycle
+ # 0.02 stalled cycles per insn (76.06%)
+ 0.481133771 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.553046e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.625543e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.628933e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4
-TOTAL : 1.805972 sec
- 6,061,500,102 cycles # 2.982 GHz
- 12,310,314,591 instructions # 2.03 insn per cycle
- 2.091644106 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.691757e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.728443e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.728888e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4
+TOTAL : 3.306610 sec
+ 11,100,655,015 cycles:u # 3.333 GHz (75.02%)
+ 28,009,471 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.04%)
+ 1,145,966,139 stalled-cycles-backend:u # 10.32% backend cycles idle (75.07%)
+ 9,001,507,576 instructions:u # 0.81 insn per cycle
+ # 0.13 stalled cycles per insn (75.06%)
+ 3.353666898 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626791e-04
+Avg ME (F77/CUDA) = 6.6270899361878938E-004
+Relative difference = 4.511024836808726e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.022336e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.023344e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.023344e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.476067e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.477134e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.477134e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.119195 sec - 24,244,271,861 cycles # 2.987 GHz - 75,879,602,897 instructions # 3.13 insn per cycle - 8.123805803 seconds time elapsed +TOTAL : 6.629405 sec + 23,294,577,294 cycles:u # 3.503 GHz (74.98%) + 1,353,787 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 2,744,428,038 stalled-cycles-backend:u # 11.78% backend cycles idle (74.98%) + 75,897,993,100 instructions:u # 3.26 insn per cycle + # 0.04 stalled cycles per insn (74.98%) + 6.652333857 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.406283e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.420917e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.420917e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 2.223949 sec - 6,505,808,480 cycles # 2.921 GHz - 20,112,760,587 instructions # 3.09 insn per cycle - 2.228603955 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.953957e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.971733e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.971733e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.653575 sec + 5,843,689,539 cycles:u # 3.489 GHz (74.72%) + 730,607 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%) + 846,579,829 stalled-cycles-backend:u # 14.49% backend cycles idle (75.12%) + 20,140,448,269 instructions:u # 3.45 insn per cycle + # 0.04 stalled cycles per insn (75.17%) + 1.676778995 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.659789e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.666953e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.666953e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 -TOTAL : 0.997762 sec - 2,823,075,116 cycles # 2.818 GHz - 7,034,476,103 instructions # 2.49 insn per cycle - 1.002660023 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.896724e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905869e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905869e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 -TOTAL : 0.874348 sec - 2,480,579,012 cycles # 2.825 GHz - 6,275,642,885 instructions # 2.53 insn per cycle - 0.879164184 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.352801e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.363073e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.363073e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.703244 sec + 2,498,771,333 cycles:u # 3.446 GHz (74.63%) + 581,092 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.63%) + 306,067,701 stalled-cycles-backend:u # 12.25% backend cycles idle (74.63%) + 7,101,942,909 instructions:u # 2.84 insn per cycle + # 0.04 stalled cycles per insn (74.82%) + 0.726696573 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.501120e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.506981e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.506981e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.102776 sec - 2,039,833,705 cycles # 1.844 GHz - 3,246,168,937 instructions # 1.59 insn per cycle - 1.107482039 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 52bc217491..6fdba85723 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -6,8 +6,9 @@ AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' @@ -41,7 +42,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-02_17:22:24 +DATE: 2024-02-05_22:07:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +51,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.352716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.405014e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.410507e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.355863e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.408384e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.413572e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.462117 sec - 1,979,113,014 cycles # 2.942 GHz - 2,921,001,590 instructions # 1.48 insn per cycle - 0.730506942 seconds time elapsed +TOTAL : 0.459733 sec + 2,017,638,225 cycles # 3.004 GHz + 3,053,099,345 instructions # 1.51 insn per cycle + 0.729037153 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +69,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.572535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.646667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.650037e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.567086e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.640820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.644053e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.748717 sec - 5,908,738,221 cycles # 2.990 GHz - 12,795,759,812 instructions # 2.17 insn per cycle - 2.033581770 seconds time elapsed +TOTAL : 1.746231 sec + 6,038,476,497 cycles # 3.062 GHz + 12,908,245,229 instructions # 2.14 insn per cycle + 2.029486292 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +92,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.058055e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.059092e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.059092e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.086272e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.087317e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.087317e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.978409 sec - 24,222,918,393 cycles # 3.036 GHz - 75,879,677,540 instructions # 3.13 insn per cycle - 7.983394906 seconds time elapsed +TOTAL : 7.869325 sec + 24,205,277,736 cycles # 3.075 GHz + 75,876,375,845 instructions # 3.13 insn per cycle + 7.874126058 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +119,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.377230e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.391176e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.391176e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.604059e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.618615e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.618615e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.231579 sec - 6,480,537,819 cycles # 2.899 GHz - 20,114,312,086 instructions # 3.10 insn per cycle - 2.236293663 seconds time elapsed +TOTAL : 2.165288 sec + 6,496,211,625 cycles # 2.995 GHz + 20,114,277,451 instructions # 3.10 insn per cycle + 2.170165986 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +146,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.597190e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.604027e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.604027e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.714168e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.721433e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.721433e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.035584 sec - 2,822,977,150 cycles # 2.716 GHz - 7,037,452,350 instructions # 2.49 insn per cycle - 1.040480309 seconds time elapsed +TOTAL : 0.964960 sec + 2,815,359,642 cycles # 2.907 GHz + 7,036,914,271 instructions # 2.50 insn per cycle + 0.969520380 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +173,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.898877e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.907997e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.907997e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.951631e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.961047e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.961047e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.872188 sec - 2,477,217,084 cycles # 2.828 GHz - 6,279,275,313 instructions # 2.53 insn per cycle - 0.877053742 seconds time elapsed +TOTAL : 0.848346 sec + 2,475,916,860 cycles # 2.906 GHz + 6,279,126,359 instructions # 2.54 insn per cycle + 0.852826821 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +200,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.510333e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.516093e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.516093e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.553949e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.559900e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.559900e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.094615 sec - 2,036,960,778 cycles # 1.855 GHz - 3,247,787,972 instructions # 1.59 insn per cycle - 1.099682664 seconds time elapsed +TOTAL : 1.063833 sec + 2,035,247,459 cycles # 1.907 GHz + 3,247,417,227 instructions # 1.60 insn per cycle + 1.068423622 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 1bdee9128e..b29358c30c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,226 +1,187 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-02_17:19:03 +DATE: 2024-02-08_19:18:13 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.738644e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.374846e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.380150e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.463947 sec - 1,981,044,690 cycles # 2.935 GHz - 3,003,724,131 instructions # 1.52 insn per cycle - 0.732462015 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.589313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.760043e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.761719e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 +TOTAL : 0.442297 sec + 1,270,659,451 cycles:u # 2.722 GHz (73.76%) + 3,440,593 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.34%) + 34,153,287 stalled-cycles-backend:u # 2.69% backend cycles idle (74.91%) + 1,612,304,356 instructions:u # 1.27 insn per cycle + # 0.02 stalled cycles per insn (76.39%) + 0.485585677 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.478245e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.631296e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.634570e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.822950 sec - 6,134,380,342 cycles # 2.990 GHz - 13,046,304,857 instructions # 2.13 insn per cycle - 2.108367566 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.295674e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.723764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.724198e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.417280 sec + 11,488,321,170 cycles:u # 3.329 GHz (75.04%) + 39,406,344 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.04%) + 1,148,055,081 stalled-cycles-backend:u # 9.99% backend cycles idle (74.97%) + 9,911,157,892 instructions:u # 0.86 insn per cycle + # 0.12 stalled cycles per insn (74.95%) + 3.471056448 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.038132e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.039160e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.039160e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.057993 sec - 24,208,031,069 cycles # 3.004 GHz - 75,877,309,450 instructions # 3.13 insn per cycle - 8.062762773 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.471309e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.472359e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.472359e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.642241 sec + 23,339,711,815 cycles:u # 3.502 GHz (74.95%) + 1,451,091 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 2,749,255,146 stalled-cycles-backend:u # 11.78% backend cycles idle (75.03%) + 75,875,175,331 instructions:u # 3.25 insn per cycle + # 0.04 stalled cycles per insn (75.03%) + 6.666216415 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.369652e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.383196e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.383196e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.234023 sec - 6,502,073,822 cycles # 2.906 GHz - 20,115,555,328 instructions # 3.09 insn per cycle - 2.238859131 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.963885e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.981548e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.981548e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.652129 sec + 5,840,229,414 cycles:u # 3.489 GHz (74.71%) + 711,949 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 842,473,856 stalled-cycles-backend:u # 14.43% backend cycles idle (75.15%) + 20,139,686,739 instructions:u # 3.45 insn per cycle + # 0.04 stalled cycles per insn (75.16%) + 1.675893194 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.663579e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.670600e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.670600e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.994339 sec - 2,817,579,461 cycles # 2.823 GHz - 7,037,046,074 instructions # 2.50 insn per cycle - 0.999147094 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.358784e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.369131e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.369131e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.701451 sec + 2,496,799,364 cycles:u # 3.451 GHz (74.58%) + 541,349 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.58%) + 303,569,480 stalled-cycles-backend:u # 12.16% backend cycles idle (74.66%) + 7,097,254,370 instructions:u # 2.84 insn per cycle + # 0.04 stalled cycles per insn (75.06%) + 0.725439966 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.901269e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.910382e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.910382e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.871161 sec - 2,477,059,767 cycles # 2.831 GHz - 6,279,143,693 instructions # 2.53 insn per cycle - 0.875916218 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.515652e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.521393e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.521393e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.090755 sec - 2,035,184,035 cycles # 1.859 GHz - 3,247,446,640 instructions # 1.60 insn per cycle - 1.095604515 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 88808cf8cd..c47054a2d7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,223 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-02_16:40:30 +DATE: 2024-02-08_18:26:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.318719e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.382224e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.388544e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.483471 sec - 2,053,900,532 cycles # 2.935 GHz - 3,000,198,761 instructions # 1.46 insn per cycle - 0.786473589 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.574080e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.764946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.765663e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 +TOTAL : 0.462133 sec + 1,130,126,390 cycles:u # 2.394 GHz (73.37%) + 2,241,615 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.40%) + 5,278,042 stalled-cycles-backend:u # 0.47% backend cycles idle (75.67%) + 1,542,456,049 instructions:u # 1.36 insn per cycle + # 0.00 stalled cycles per insn (76.31%) + 0.510089687 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.535569e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.625110e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.628948e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.726125 sec - 5,878,273,990 cycles # 2.999 GHz - 11,738,072,510 instructions # 2.00 insn per cycle - 2.016971433 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.717453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.744677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.745205e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 +TOTAL : 2.637025 sec + 7,807,862,465 cycles:u # 2.935 GHz (74.88%) + 2,445,734 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.86%) + 4,656,932 stalled-cycles-backend:u # 0.06% backend cycles idle (74.86%) + 6,756,353,612 instructions:u # 0.87 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 2.688081893 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.033206e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.034208e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.034208e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.078447 sec - 24,231,115,403 cycles # 2.999 GHz - 75,804,621,532 instructions # 3.13 insn per cycle - 8.085698218 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.175682e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.176601e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.176601e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 7.544535 sec + 23,318,888,360 cycles:u # 3.081 GHz (74.95%) + 1,412,271 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 2,230,475,680 stalled-cycles-backend:u # 9.57% backend cycles idle (74.98%) + 75,827,109,693 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (75.02%) + 7.570599830 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870430095556E-004 -Relative difference = 6.489572191632735e-09 +Avg ME (F77/C++) = 6.6274866108667618E-004 +Relative difference = 5.871505118544242e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.464699e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.478811e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.478811e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.206427 sec - 6,493,972,484 cycles # 2.938 GHz - 20,111,156,170 instructions # 3.10 insn per cycle - 2.220582301 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.773713e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.789536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.789536e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.875876 sec + 5,837,657,312 cycles:u # 3.073 GHz (74.62%) + 745,118 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.81%) + 831,661,751 stalled-cycles-backend:u # 14.25% backend cycles idle (75.16%) + 20,114,759,991 instructions:u # 3.45 insn per cycle + # 0.04 stalled cycles per insn (75.16%) + 1.903414736 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.670693e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.677451e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.677451e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.990263 sec - 2,812,362,707 cycles # 2.827 GHz - 7,037,909,772 instructions # 2.50 insn per cycle - 1.006064967 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.089617e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.098746e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.098746e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.791538 sec + 2,482,634,076 cycles:u # 3.044 GHz (74.62%) + 523,574 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.50%) + 301,565,450 stalled-cycles-backend:u # 12.15% backend cycles idle (74.58%) + 7,085,054,777 instructions:u # 2.85 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 0.818943329 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11569) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.913599e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.922614e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.922614e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.865305 sec - 2,474,670,209 cycles # 2.845 GHz - 6,280,249,125 instructions # 2.54 insn per cycle - 0.881073251 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.523408e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.529240e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.529240e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.085153 sec - 2,036,969,620 cycles # 1.869 GHz - 3,247,806,845 instructions # 1.59 insn per cycle - 1.096638091 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 706f6dded4..434d8447ad 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,223 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-02_17:04:23 +DATE: 2024-02-08_18:54:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.570307e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.616380e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.621825e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.493040 sec - 2,048,646,960 cycles # 2.850 GHz - 3,033,622,154 instructions # 1.48 insn per cycle - 0.778065801 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.570012e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.763594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.765152e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 +TOTAL : 0.425460 sec + 1,135,202,481 cycles:u # 2.555 GHz (75.07%) + 2,145,635 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.04%) + 6,038,783 stalled-cycles-backend:u # 0.53% backend cycles idle (74.71%) + 1,550,757,081 instructions:u # 1.37 insn per cycle + # 0.00 stalled cycles per insn (74.70%) + 0.470654879 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.695270e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.755712e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.758301e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.859894 sec - 6,268,867,779 cycles # 2.989 GHz - 13,449,269,342 instructions # 2.15 insn per cycle - 2.154301292 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.703320e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731268e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731699e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 +TOTAL : 2.614314 sec + 8,769,750,029 cycles:u # 3.322 GHz (74.93%) + 2,537,311 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.14%) + 5,667,727 stalled-cycles-backend:u # 0.06% backend cycles idle (75.03%) + 7,386,308,964 instructions:u # 0.84 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 2.666031947 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.757360e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.758198e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.758198e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.494019 sec - 85,961,926,783 cycles # 3.017 GHz - 133,987,952,834 instructions # 1.56 insn per cycle - 28.498722219 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.343794e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.344484e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.344484e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 25.858345 sec + 90,701,070,690 cycles:u # 3.505 GHz (74.99%) + 534,988,208 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.99%) + 6,752,209,392 stalled-cycles-backend:u # 7.44% backend cycles idle (74.99%) + 134,091,433,811 instructions:u # 1.48 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 25.882815835 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275354356437610E-004 -Relative difference = 6.573239683366044e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627534e-04 +Avg ME (F77/C++) = 6.6275340697351248E-004 +Relative difference = 1.052203199451665e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.079271e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.092799e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.092799e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 -TOTAL : 2.325312 sec - 6,721,105,667 cycles # 2.885 GHz - 19,163,359,526 instructions # 2.85 insn per cycle - 2.330805911 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.093528e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.105198e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.105198e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 2.032623 sec + 7,167,162,470 cycles:u # 3.488 GHz (74.98%) + 6,640,055 stalled-cycles-frontend:u # 0.09% frontend cycles idle (75.09%) + 3,073,494,724 stalled-cycles-backend:u # 42.88% backend cycles idle (75.09%) + 19,179,691,319 instructions:u # 2.68 insn per cycle + # 0.16 stalled cycles per insn (75.09%) + 2.058490331 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274859783433532E-004 -Relative difference = 3.2677016209485094e-09 +Avg ME (F77/C++) = 6.6274857053714997E-004 +Relative difference = 4.445554471174176e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.482015e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.487470e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.487470e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.115317 sec - 3,149,691,380 cycles # 2.815 GHz - 6,746,734,096 instructions # 2.14 insn per cycle - 1.120200492 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.480499e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.484527e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.484527e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.114030 sec + 3,937,359,669 cycles:u # 3.465 GHz (74.66%) + 599,044 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.71%) + 2,266,035,223 stalled-cycles-backend:u # 57.55% backend cycles idle (75.07%) + 6,767,958,144 instructions:u # 1.72 insn per cycle + # 0.33 stalled cycles per insn (75.37%) + 1.139732435 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48607) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724143469353E-004 -Relative difference = 6.252149235286529e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735722101156E-004 +Relative difference = 6.454990161554483e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.799526e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.807631e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.807631e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.920030 sec - 2,605,520,479 cycles # 2.820 GHz - 5,931,112,894 instructions # 2.28 insn per cycle - 0.924895307 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724143469353E-004 -Relative difference = 6.252149235286529e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.462840e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.468198e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.468198e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.129651 sec - 2,048,944,002 cycles # 1.809 GHz - 3,435,895,283 instructions # 1.68 insn per cycle - 1.134622757 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272748295826550E-004 -Relative difference = 2.5714542480216212e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index d7932de41b..4c84e01f99 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,223 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-02_17:05:15 +DATE: 2024-02-08_18:55:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.513346e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.555219e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.559578e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.489191 sec - 2,075,509,383 cycles # 2.922 GHz - 3,109,466,868 instructions # 1.50 insn per cycle - 0.771470978 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.568788e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.766183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.768168e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 +TOTAL : 0.423804 sec + 1,150,029,388 cycles:u # 2.592 GHz (75.47%) + 2,123,880 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.93%) + 5,213,875 stalled-cycles-backend:u # 0.45% backend cycles idle (75.57%) + 1,542,493,709 instructions:u # 1.34 insn per cycle + # 0.00 stalled cycles per insn (75.30%) + 0.472356355 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.687719e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.748091e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.750832e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.860019 sec - 6,224,710,763 cycles # 2.971 GHz - 12,373,955,784 instructions # 1.99 insn per cycle - 2.154016804 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.723431e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.750359e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.750885e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 +TOTAL : 2.609005 sec + 8,745,463,980 cycles:u # 3.319 GHz (74.88%) + 2,568,646 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.12%) + 5,702,914 stalled-cycles-backend:u # 0.07% backend cycles idle (75.13%) + 7,426,953,439 instructions:u # 0.85 insn per cycle + # 0.00 stalled cycles per insn (75.10%) + 2.662163005 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.758573e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.759396e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.759396e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.488106 sec - 85,666,262,535 cycles # 3.008 GHz - 134,121,851,061 instructions # 1.57 insn per cycle - 28.493065302 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.227875e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.228540e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.228540e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 26.339306 sec + 92,374,923,336 cycles:u # 3.504 GHz (75.00%) + 438,465,384 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.99%) + 6,550,802,401 stalled-cycles-backend:u # 7.09% backend cycles idle (74.99%) + 133,996,906,633 instructions:u # 1.45 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 26.363927936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16105) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627536e-04 -Avg ME (F77/C++) = 6.6275357377482830E-004 -Relative difference = 3.95700176737784e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275346486299042E-004 +Relative difference = 5.301670926116898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.194442e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.207802e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.207802e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 -TOTAL : 2.288171 sec - 6,715,091,832 cycles # 2.930 GHz - 19,223,532,719 instructions # 2.86 insn per cycle - 2.293016101 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.448607e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.461579e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.461579e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 1.947250 sec + 6,863,113,366 cycles:u # 3.484 GHz (74.82%) + 737,553 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.74%) + 3,334,442,234 stalled-cycles-backend:u # 48.58% backend cycles idle (74.95%) + 19,238,579,959 instructions:u # 2.80 insn per cycle + # 0.17 stalled cycles per insn (75.23%) + 1.973159628 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274859765498573E-004 -Relative difference = 3.538316437387639e-09 +Avg ME (F77/C++) = 6.6274857044990032E-004 +Relative difference = 4.4587192899226015e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.516788e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.522581e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.522581e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.089727 sec - 3,077,409,483 cycles # 2.814 GHz - 6,686,511,430 instructions # 2.17 insn per cycle - 1.094494891 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.500626e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.504745e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.504745e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.099013 sec + 3,884,604,581 cycles:u # 3.463 GHz (75.04%) + 1,235,591 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.04%) + 2,198,119,202 stalled-cycles-backend:u # 56.59% backend cycles idle (75.05%) + 6,705,713,189 instructions:u # 1.73 insn per cycle + # 0.33 stalled cycles per insn (75.05%) + 1.129463927 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47398) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627272e-04
-Avg ME (F77/C++) = 6.6272724133897148E-004
-Relative difference = 6.237705578619894e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627274e-04
+Avg ME (F77/C++) = 6.6272735755491807E-004
+Relative difference = 6.404606472340801e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.788141e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.796318e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.796318e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.929795 sec
- 2,609,743,059 cycles # 2.802 GHz
- 5,936,205,182 instructions # 2.27 insn per cycle
- 0.934835318 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627272e-04
-Avg ME (F77/C++) = 6.6272724133897148E-004
-Relative difference = 6.237705578619894e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.490514e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.496118e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.496118e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.109035 sec
- 2,047,105,275 cycles # 1.840 GHz
- 3,422,534,037 instructions # 1.67 insn per cycle
- 1.113792508 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627275e-04
-Avg ME (F77/C++) = 6.6272749650985591E-004
-Relative difference = 5.26633351741962e-09
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index 85c739d765..21eb60d64b 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_16:40:59
+DATE: 2024-02-08_18:26:34
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.511605e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.545751e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.548271e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.525663 sec
- 2,206,454,375 cycles # 2.905 GHz
- 3,400,787,577 instructions # 1.54 insn per cycle
- 0.830920843 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.383338e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.553589e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.555084e+04 ) sec^-1
+MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4
+TOTAL : 0.663207 sec
+ 1,784,805,123 cycles:u # 2.621 GHz (73.34%)
+ 2,371,286 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.76%)
+ 5,590,912 stalled-cycles-backend:u # 0.31% backend cycles idle (75.35%)
+ 2,055,029,836 instructions:u # 1.15 insn per cycle
+ # 0.00 stalled cycles per insn (75.35%)
+ 0.710045389 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.121293e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.155517e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.156917e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.050944 sec
- 9,745,214,051 cycles # 2.937 GHz
- 21,902,353,915 instructions # 2.25 insn per cycle
- 3.375478303 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.248062e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.250593e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.250651e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4
+TOTAL : 7.718381 sec
+ 23,348,642,052 cycles:u # 3.022 GHz (74.93%)
+ 3,266,528 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%)
+ 6,116,139 stalled-cycles-backend:u # 0.03% backend cycles idle (75.05%)
+ 18,753,652,615 instructions:u # 0.80 insn per cycle
+ # 0.00 stalled cycles per insn (75.05%)
+ 7.772673424 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
Avg ME (F77/CUDA) = 6.6266732376103494E-004
Relative difference = 2.659538381540814e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.837176e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.838028e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.838028e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.937616 sec
- 26,816,433,740 cycles # 3.002 GHz
- 82,463,371,522 instructions # 3.08 insn per cycle
- 8.945264041 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.898017e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.898749e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.898749e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 8.649331 sec
+ 26,772,745,603 cycles:u # 3.087 GHz (74.99%)
+ 57,039,124 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.00%)
+ 3,873,693,774 stalled-cycles-backend:u # 14.47% backend cycles idle (75.00%)
+ 82,448,956,627 instructions:u # 3.08 insn per cycle
+ # 0.05 stalled cycles per insn (75.00%)
+ 8.675961875 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731406016235E-004
Relative difference = 2.8059296349552523e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.664472e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.667735e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.667735e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.485186 sec
- 12,637,052,128 cycles # 2.815 GHz
- 38,538,553,186 instructions # 3.05 insn per cycle
- 4.499813895 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.502956e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.507031e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.507031e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 3.650850 sec
+ 11,298,807,934 cycles:u # 3.075 GHz (74.99%)
+ 3,701,304 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%)
+ 1,198,952,375 stalled-cycles-backend:u # 10.61% backend cycles idle (74.97%)
+ 38,525,344,308 instructions:u # 3.41 insn per cycle
+ # 0.03 stalled cycles per insn (74.97%)
+ 3.678443473 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266730246908442E-004
Relative difference = 2.98084507782618e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.416066e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.433719e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.433719e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.958633 sec
- 5,539,266,832 cycles # 2.822 GHz
- 13,583,063,983 instructions # 2.45 insn per cycle
- 1.974787179 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.075722e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.077981e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.077981e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 1.533338 sec
+ 4,767,692,102 cycles:u # 3.058 GHz (74.84%)
+ 875,813 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.86%)
+ 518,076,988 stalled-cycles-backend:u # 10.87% backend cycles idle (74.86%)
+ 13,617,685,988 instructions:u # 2.86 insn per cycle
+ # 0.04 stalled cycles per insn (74.73%)
+ 1.597738771 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266730409276836E-004
Relative difference = 2.9563428359824236e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.604029e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.627258e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.627258e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.717957 sec
- 4,843,685,631 cycles # 2.812 GHz
- 12,112,197,569 instructions # 2.50 insn per cycle
- 1.734047586 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266730409276836E-004
-Relative difference = 2.9563428359824236e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.445984e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.460006e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.460006e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.213198 sec
- 4,094,933,838 cycles # 1.847 GHz
- 6,282,763,113 instructions # 1.53 insn per cycle
- 2.227854397 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266730409276836E-004
-Relative difference = 2.9563428359824236e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
index 8a419bcfa6..d643daf349 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-02-02_16:41:37
+DATE: 2024-02-08_18:27:12
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.480549e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.513537e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.515984e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.526274 sec
- 2,251,834,982 cycles # 2.940 GHz
- 3,456,504,618 instructions # 1.53 insn per cycle
- 0.837763569 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.384844e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.440698e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.441232e+05 ) sec^-1
+MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4
+TOTAL : 0.532172 sec
+ 1,450,165,108 cycles:u # 2.635 GHz (73.15%)
+ 2,344,129 stalled-cycles-frontend:u # 0.16% frontend cycles idle (73.67%)
+ 4,952,838 stalled-cycles-backend:u # 0.34% backend cycles idle (75.78%)
+ 1,799,622,271 instructions:u # 1.24 insn per cycle
+ # 0.00 stalled cycles per insn (75.69%)
+ 0.578640825 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.135000e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.169282e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.170545e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.026404 sec
- 9,816,860,219 cycles # 2.989 GHz
- 20,625,307,668 instructions # 2.10 insn per cycle
- 3.341501132 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.732714e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.737520e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.737629e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4
+TOTAL : 6.386833 sec
+ 19,706,454,901 cycles:u # 3.074 GHz (74.97%)
+ 3,130,006 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.05%)
+ 5,448,176 stalled-cycles-backend:u # 0.03% backend cycles idle (75.05%)
+ 15,940,692,639 instructions:u # 0.81 insn per cycle
+ # 0.00 stalled cycles per insn (75.09%)
+ 6.438658275 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
Avg ME (F77/CUDA) = 6.6266732376103494E-004
Relative difference = 2.659538381540814e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.836646e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.837505e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.837505e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.940763 sec
- 26,788,704,891 cycles # 2.995 GHz
- 82,360,335,362 instructions # 3.07 insn per cycle
- 8.948880836 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.978606e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.979399e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.979399e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 8.297021 sec
+ 26,214,200,018 cycles:u # 3.151 GHz (75.00%)
+ 14,108,761 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.00%)
+ 3,509,403,049 stalled-cycles-backend:u # 13.39% backend cycles idle (75.00%)
+ 82,326,114,832 instructions:u # 3.14 insn per cycle
+ # 0.04 stalled cycles per insn (75.00%)
+ 8.322924348 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731406016235E-004
Relative difference = 2.8059296349552523e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.658088e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.661492e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.661492e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.494574 sec
- 12,655,992,906 cycles # 2.814 GHz
- 38,557,304,910 instructions # 3.05 insn per cycle
- 4.505034990 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.594292e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.598635e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.598635e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 3.578014 sec
+ 11,347,535,968 cycles:u # 3.151 GHz (74.93%)
+ 5,170,678 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.90%)
+ 1,362,756,631 stalled-cycles-backend:u # 12.01% backend cycles idle (74.84%)
+ 38,598,602,630 instructions:u # 3.40 insn per cycle
+ # 0.04 stalled cycles per insn (74.95%)
+ 3.605311382 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266730246908442E-004
Relative difference = 2.98084507782618e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.455468e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.473026e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.473026e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.950389 sec
- 5,499,335,360 cycles # 2.814 GHz
- 13,596,039,431 instructions # 2.47 insn per cycle
- 1.964720334 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.107638e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.110113e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.110113e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 1.488940 sec
+ 4,782,141,770 cycles:u # 3.162 GHz (74.74%)
+ 1,620,929 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%)
+ 473,222,387 stalled-cycles-backend:u # 9.90% backend cycles idle (75.15%)
+ 13,605,640,779 instructions:u # 2.85 insn per cycle
+ # 0.03 stalled cycles per insn (75.15%)
+ 1.516094692 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10908) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266730409276836E-004
Relative difference = 2.9563428359824236e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.616891e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.640790e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.640790e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.715382 sec
- 4,835,096,763 cycles # 2.811 GHz
- 12,121,623,664 instructions # 2.51 insn per cycle
- 1.727585249 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266730409276836E-004
-Relative difference = 2.9563428359824236e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.487442e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.501592e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.501592e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.201370 sec
- 4,089,267,548 cycles # 1.855 GHz
- 6,288,818,816 instructions # 1.54 insn per cycle
- 2.213711911 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266730409276836E-004
-Relative difference = 2.9563428359824236e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
index e4a672d47c..c2c8b7de5b 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-02_16:44:02 +DATE: 2024-02-08_18:29:20 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.063154e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.063540e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.063645e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.468695 sec - 8,205,323,951 cycles # 2.993 GHz - 17,048,140,069 instructions # 2.08 insn per cycle - 2.867804718 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.014738e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.080986e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.081072e+01 ) sec^-1 +MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 +TOTAL : 9.534892 sec + 28,671,498,376 cycles:u # 3.043 GHz (74.91%) + 3,858,221 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) + 6,926,757 stalled-cycles-backend:u # 0.02% backend cycles idle (75.05%) + 22,688,422,982 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.07%) + 9.598603768 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.258357e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.260571e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.260895e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.991294 sec - 13,000,025,179 cycles # 3.011 GHz - 28,092,793,987 instructions # 2.16 insn per cycle - 4.372517528 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.562860e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566041e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566064e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 +TOTAL : 8.964154 sec + 27,295,730,208 cycles:u # 3.038 GHz (74.96%) + 3,490,636 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 7,315,078 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%) + 21,720,146,331 instructions:u # 0.80 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 9.012369847 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.051116e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.051330e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.051330e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.560394 sec
- 19,010,999,956 cycles # 2.898 GHz
- 55,180,778,972 instructions # 2.90 insn per cycle
- 6.567268442 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 8.963792e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.964026e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.964026e+01 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 5.892164 sec
+ 18,219,567,667 cycles:u # 3.080 GHz (74.98%)
+ 29,863,517 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.98%)
+ 2,207,836,365 stalled-cycles-backend:u # 12.12% backend cycles idle (74.98%)
+ 55,186,598,006 instructions:u # 3.03 insn per cycle
+ # 0.04 stalled cycles per insn (74.98%)
+ 5.918128422 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.623416e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.623503e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.623503e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.259457 sec
- 9,816,874,130 cycles # 3.010 GHz
- 27,056,571,682 instructions # 2.76 insn per cycle
- 3.274655785 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.981674e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.981786e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.981786e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 2.665922 sec
+ 8,287,182,014 cycles:u # 3.082 GHz (75.05%)
+ 1,561,165 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.01%)
+ 781,453,679 stalled-cycles-backend:u # 9.43% backend cycles idle (75.01%)
+ 27,098,042,601 instructions:u # 3.27 insn per cycle
+ # 0.03 stalled cycles per insn (75.01%)
+ 2.692616653 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.530328e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.530747e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.530747e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.508597 sec
- 4,240,820,826 cycles # 2.815 GHz
- 9,566,680,835 instructions # 2.26 insn per cycle
- 1.521984798 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.713124e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.713709e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.713709e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 1.118807 sec
+ 3,604,878,612 cycles:u # 3.160 GHz (74.85%)
+ 1,062,079 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.76%)
+ 285,760,489 stalled-cycles-backend:u # 7.93% backend cycles idle (74.76%)
+ 9,607,206,483 instructions:u # 2.67 insn per cycle
+ # 0.03 stalled cycles per insn (74.82%)
+ 1.144369218 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285411531E-003
Relative difference = 3.516375977906115e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.069612e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.070241e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.070241e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.306374 sec
- 3,695,802,939 cycles # 2.825 GHz
- 8,451,330,952 instructions # 2.29 insn per cycle
- 1.318394195 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.635001e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.635609e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.635609e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.463169 sec
- 2,682,901,553 cycles # 1.834 GHz
- 4,249,342,718 instructions # 1.58 insn per cycle
- 1.474586471 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
index 1437f2e653..561061960f 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt
@@ -1,240 +1,196 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-02-02_17:14:21
+DATE: 2024-02-08_19:12:13
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.063602e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.064552e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.064552e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.383455 sec
- 8,113,917,849 cycles # 2.991 GHz
- 17,560,291,774 instructions # 2.16 insn per cycle
- 2.772628074 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 8.044322e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.045083e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.045083e+01 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 9.391291 sec
+ 32,517,683,109 cycles:u # 3.454 GHz (74.98%)
+ 3,689,842 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%)
+ 7,133,306 stalled-cycles-backend:u # 0.02% backend cycles idle (75.04%)
+ 25,691,612,283 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (75.04%)
+ 9.440910348 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.200648e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.234494e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.234494e+03 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 4.000624 sec
- 12,963,353,611 cycles # 2.997 GHz
- 28,015,281,769 instructions # 2.16 insn per cycle
- 4.381993157 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.547888e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.551556e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.551556e+03 ) sec^-1
+MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6
+TOTAL : 8.983518 sec
+ 31,090,612,402 cycles:u # 3.452 GHz (74.95%)
+ 4,800,576 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%)
+ 53,604,528 stalled-cycles-backend:u # 0.17% backend cycles idle (75.01%)
+ 24,535,934,860 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (75.06%)
+ 9.033098812 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 9.872263e-03
-Avg ME (F77/CUDA) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/CUDA) = 9.8722595284406710E-003
+Relative difference = 3.516477760164775e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.249231e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.249467e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.249467e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.417820 sec
- 18,998,624,409 cycles # 2.959 GHz
- 55,180,320,580 instructions # 2.90 insn per cycle
- 6.423348381 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.025361e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.025389e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.025389e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 5.150974 sec
+ 18,105,899,310 cycles:u # 3.500 GHz (74.95%)
+ 25,398,193 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.95%)
+ 2,124,566,094 stalled-cycles-backend:u # 11.73% backend cycles idle (74.95%)
+ 55,245,010,996 instructions:u # 3.05 insn per cycle
+ # 0.04 stalled cycles per insn (74.96%)
+ 5.175179432 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.634235e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.634331e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.634331e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.236503 sec
- 9,805,620,813 cycles # 3.026 GHz
- 27,055,897,648 instructions # 2.76 insn per cycle
- 3.241287649 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.250260e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.250388e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.250388e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 2.348263 sec
+ 8,274,094,109 cycles:u # 3.490 GHz (75.03%)
+ 1,021,812 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%)
+ 777,449,835 stalled-cycles-backend:u # 9.40% backend cycles idle (75.03%)
+ 27,051,629,562 instructions:u # 3.27 insn per cycle
+ # 0.03 stalled cycles per insn (75.03%)
+ 2.373845370 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.541518e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.541965e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.541965e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.498165 sec
- 4,241,875,959 cycles # 2.824 GHz
- 9,565,098,922 instructions # 2.25 insn per cycle
- 1.503106643 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 5.227010e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.227682e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.227682e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 1.011912 sec
+ 3,596,339,890 cycles:u # 3.478 GHz (74.76%)
+ 1,231,609 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.09%)
+ 273,275,118 stalled-cycles-backend:u # 7.60% backend cycles idle (75.25%)
+ 9,569,402,426 instructions:u # 2.66 insn per cycle
+ # 0.03 stalled cycles per insn (75.25%)
+ 1.037151346 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.886649e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.887252e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.887252e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.366385 sec
- 3,713,592,351 cycles # 2.714 GHz
- 8,451,672,882 instructions # 2.28 insn per cycle
- 1.371290961 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285411531E-003
Relative difference = 3.516375977906115e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.625734e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.626339e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.626339e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.461940 sec
- 2,683,330,541 cycles # 1.831 GHz
- 4,248,827,784 instructions # 1.58 insn per cycle
- 1.466965340 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
index b4cec4d1cf..06ffd83976 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-02-02_16:45:06
+DATE: 2024-02-08_18:31:06
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.065738e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.066155e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.066340e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.455005 sec
- 8,060,851,684 cycles # 2.932 GHz
- 17,983,054,818 instructions # 2.23 insn per cycle
- 2.854412989 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 8.206191e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.211569e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.211681e+01 ) sec^-1
+MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6
+TOTAL : 9.291628 sec
+ 28,876,587,435 cycles:u # 3.107 GHz (74.94%)
+ 3,733,262 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%)
+ 6,377,437 stalled-cycles-backend:u # 0.02% backend cycles idle (74.97%)
+ 22,892,391,852 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (75.01%)
+ 9.338653724 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.240165e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.242295e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.242542e+03 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.996298 sec
- 13,022,365,494 cycles # 3.013 GHz
- 30,533,936,591 instructions # 2.34 insn per cycle
- 4.378401983 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.566324e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.569681e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.569722e+03 ) sec^-1
+MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6
+TOTAL : 8.965793 sec
+ 27,937,142,550 cycles:u # 3.109 GHz (74.98%)
+ 3,362,502 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%)
+ 7,180,647 stalled-cycles-backend:u # 0.03% backend cycles idle (74.97%)
+ 22,159,141,344 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (74.99%)
+ 9.012156381 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 9.872263e-03
-Avg ME (F77/CUDA) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/CUDA) = 9.8722595284406710E-003
+Relative difference = 3.516477760164775e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.308882e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.309118e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.309118e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.374331 sec
- 18,904,393,388 cycles # 2.966 GHz
- 55,159,178,279 instructions # 2.92 insn per cycle
- 6.381101683 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.247292e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.247538e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.247538e+01 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 5.714198 sec
+ 18,040,255,390 cycles:u # 3.145 GHz (74.92%)
+ 24,591,420 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.96%)
+ 2,224,872,655 stalled-cycles-backend:u # 12.33% backend cycles idle (75.02%)
+ 55,158,939,146 instructions:u # 3.06 insn per cycle
+ # 0.04 stalled cycles per insn (75.04%)
+ 5.739565254 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.634462e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.634566e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.634566e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.240556 sec
- 9,788,383,999 cycles # 3.020 GHz
- 27,064,526,230 instructions # 2.76 insn per cycle
- 3.252929348 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.020985e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.021102e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.021102e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 2.615679 sec
+ 8,282,568,893 cycles:u # 3.138 GHz (74.73%)
+ 978,772 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.72%)
+ 774,826,118 stalled-cycles-backend:u # 9.35% backend cycles idle (74.99%)
+ 27,108,501,706 instructions:u # 3.27 insn per cycle
+ # 0.03 stalled cycles per insn (75.12%)
+ 2.642868514 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285514851E-003
Relative difference = 3.5163655122073967e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.550195e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.550639e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.550639e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.495682 sec
- 4,229,566,264 cycles # 2.824 GHz
- 9,569,440,035 instructions # 2.26 insn per cycle
- 1.508955748 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.624971e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.625557e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.625557e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 1.143999 sec
+ 3,648,039,246 cycles:u # 3.125 GHz (74.74%)
+ 2,783,303 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.65%)
+ 339,048,447 stalled-cycles-backend:u # 9.29% backend cycles idle (74.71%)
+ 9,622,788,226 instructions:u # 2.64 insn per cycle
+ # 0.04 stalled cycles per insn (74.97%)
+ 1.171053302 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84231) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.872263e-03
Avg ME (F77/C++) = 9.8722595285411531E-003
Relative difference = 3.516375977906115e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.015176e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.015775e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.015775e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.323813 sec - 3,737,768,973 cycles # 2.821 GHz - 8,454,893,429 instructions # 2.26 insn per cycle - 1.339398328 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.581141e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.581694e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.581694e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.487113 sec - 2,682,355,533 cycles # 1.803 GHz - 4,251,040,741 instructions # 1.58 insn per cycle - 1.502364999 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
index 71086fc4f7..6372071c48 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-02-02_16:46:11
+DATE: 2024-02-08_18:32:49
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.758998e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.759867e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.760151e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6
-TOTAL : 1.698744 sec
- 5,769,411,699 cycles # 2.969 GHz
- 11,729,454,367 instructions # 2.03 insn per cycle
- 2.053158065 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.834552e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.837886e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.837919e+02 ) sec^-1
+MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6
+TOTAL : 4.468557 sec
+ 14,183,679,196 cycles:u # 3.160 GHz (74.85%)
+ 3,326,136 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%)
+ 6,659,579 stalled-cycles-backend:u # 0.05% backend cycles idle (75.02%)
+ 11,566,659,019 instructions:u # 0.82 insn per cycle
+ # 0.00 stalled cycles per insn (75.06%)
+ 4.518838173 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.331312e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.332088e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.332211e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6
-TOTAL : 1.909397 sec
- 6,550,909,537 cycles # 2.982 GHz
- 13,690,235,991 instructions # 2.09 insn per cycle
- 2.257029226 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.376728e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.395099e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.395255e+03 ) sec^-1
+MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6
+TOTAL : 4.702932 sec
+ 14,041,255,129 cycles:u # 2.995 GHz (74.91%)
+ 2,934,354 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%)
+ 7,163,543 stalled-cycles-backend:u # 0.05% backend cycles idle (75.05%)
+ 11,452,634,634 instructions:u # 0.82 insn per cycle
+ # 0.00 stalled cycles per insn (74.94%)
+ 4.750425234 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 9.849636e-03
-Avg ME (F77/CUDA) = 9.8712405367667715E-003
-Relative difference = 0.0021934350433631634
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 9.855155e-03
+Avg ME (F77/CUDA) = 9.8696023209835834E-003
+Relative difference = 0.0014659658811639687
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.962890e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.963189e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.963189e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 5.900127 sec
- 17,599,023,735 cycles # 2.984 GHz
- 51,787,400,595 instructions # 2.94 insn per cycle
- 5.907077102 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.738728e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.738996e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.738996e+01 ) sec^-1
+MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6
+TOTAL : 5.423944 sec
+ 16,810,814,055 cycles:u # 3.086 GHz (74.91%)
+ 12,801,190 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.03%)
+ 1,744,694,220 stalled-cycles-backend:u # 10.38% backend cycles idle (75.03%)
+ 51,792,221,693 instructions:u # 3.08 insn per cycle
+ # 0.03 stalled cycles per insn (75.03%)
+ 5.449591756 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.847961e-03
-Avg ME (F77/C++) = 9.8479612087330436E-003
-Relative difference = 2.119555946686223e-08
+Avg ME (F77/C++) = 9.8479612087414119E-003
+Relative difference = 2.1196409216982896e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.523173e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.523601e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.523601e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6
-TOTAL : 1.506846 sec
- 4,544,500,367 cycles # 3.012 GHz
- 13,760,310,089 instructions # 3.03 insn per cycle
- 1.522708024 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.049652e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.050113e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.050113e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6
+TOTAL : 1.305602 sec
+ 4,074,876,790 cycles:u # 3.066 GHz (74.80%)
+ 785,206 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.73%)
+ 421,906,256 stalled-cycles-backend:u # 10.35% backend cycles idle (74.58%)
+ 13,824,045,075 instructions:u # 3.39 insn per cycle
+ # 0.03 stalled cycles per insn (74.88%)
+ 1.332713075 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.847955e-03
-Avg ME (F77/C++) = 9.8479546894727158E-003
-Relative difference = 3.1532159158088894e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 9.847957e-03
+Avg ME (F77/C++) = 9.8479574833965355E-003
+Relative difference = 4.9085971470122835e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.020467e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.022154e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.022154e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.762481 sec
- 2,141,684,874 cycles # 2.806 GHz
- 4,827,332,027 instructions # 2.25 insn per cycle
- 0.778191988 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.892973e-03
-Avg ME (F77/C++) = 9.8929728159608508E-003
-Relative difference = 1.8603017364363385e-08
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 9.139850e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.142248e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.142248e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6
+TOTAL : 0.579538 sec
+ 1,836,301,682 cycles:u # 3.045 GHz (75.01%)
+ 952,261 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.81%)
+ 158,183,106 stalled-cycles-backend:u # 8.61% backend cycles idle (74.81%)
+ 4,853,489,425 instructions:u # 2.64 insn per cycle
+ # 0.03 stalled cycles per insn (74.81%)
+ 0.606755150 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.034088e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.036261e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.036261e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.667039 sec
- 1,880,791,918 cycles # 2.817 GHz
- 4,259,830,745 instructions # 2.26 insn per cycle
- 0.680493838 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.892973e-03
-Avg ME (F77/C++) = 9.8929728159608508E-003
-Relative difference = 1.8603017364363385e-08
+Avg ME (F77/C++) = 9.8929728161012351E-003
+Relative difference = 1.8588827066662492e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.236798e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.239291e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.239291e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6
-TOTAL : 0.740314 sec
- 1,353,287,519 cycles # 1.828 GHz
- 2,148,999,315 instructions # 1.59 insn per cycle
- 0.755231145 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.892980e-03
-Avg ME (F77/C++) = 9.8929802670331551E-003
-Relative difference = 2.699218597469717e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
index f824a0aba1..a554157bf3 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
@@ -1,240 +1,196 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-02-02_17:15:25
+DATE: 2024-02-08_19:13:53
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.796725e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.798712e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.798712e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6
-TOTAL : 1.603143 sec
- 5,604,232,742 cycles # 2.989 GHz
- 11,530,893,737 instructions # 2.06 insn per cycle
- 1.933884145 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.853224e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.853633e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.853633e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6
+TOTAL : 4.453332 sec
+ 15,223,637,685 cycles:u # 3.400 GHz (75.00%)
+ 2,771,746 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%)
+ 7,367,332 stalled-cycles-backend:u # 0.05% backend cycles idle (75.01%)
+ 12,410,544,885 instructions:u # 0.82 insn per cycle
+ # 0.00 stalled cycles per insn (75.04%)
+ 4.501474464 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.342890e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.355921e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.355921e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6
-TOTAL : 1.866823 sec
- 6,428,718,880 cycles # 2.997 GHz
- 13,921,994,966 instructions # 2.17 insn per cycle
- 2.202372822 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.369153e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.384951e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.384951e+03 ) sec^-1
+MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6
+TOTAL : 4.655580 sec
+ 15,991,009,132 cycles:u # 3.419 GHz (74.87%)
+ 3,807,736 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%)
+ 47,032,900 stalled-cycles-backend:u # 0.29% backend cycles idle (75.03%)
+ 12,953,793,598 instructions:u # 0.81 insn per cycle
+ # 0.00 stalled cycles per insn (75.03%)
+ 4.700563262 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 9.849636e-03
-Avg ME (F77/CUDA) = 9.8712405367667715E-003
-Relative difference = 0.0021934350433631634
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 9.855155e-03
+Avg ME (F77/CUDA) = 9.8696023209835834E-003
+Relative difference = 0.0014659658811639687
OK (relative difference <= 5E-3)
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.915319e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.915600e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.915600e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 5.933868 sec
- 17,607,642,840 cycles # 2.966 GHz
- 51,787,167,142 instructions # 2.94 insn per cycle
- 5.938655489 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.107264e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.107296e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.107296e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6
+TOTAL : 4.770157 sec
+ 16,770,929,930 cycles:u # 3.500 GHz (74.96%)
+ 12,660,822 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.96%)
+ 1,893,653,846 stalled-cycles-backend:u # 11.29% backend cycles idle (74.96%)
+ 51,806,136,580 instructions:u # 3.09 insn per cycle
+ # 0.04 stalled cycles per insn (74.97%)
+ 4.794448400 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.847961e-03
-Avg ME (F77/C++) = 9.8479612087330436E-003
-Relative difference = 2.119555946686223e-08
+Avg ME (F77/C++) = 9.8479612087414119E-003
+Relative difference = 2.1196409216982896e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.524966e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.525394e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.525394e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6
-TOTAL : 1.503898 sec
- 4,539,501,183 cycles # 3.011 GHz
- 13,759,142,011 instructions # 3.03 insn per cycle
- 1.508931914 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.589146e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.589670e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.589670e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6
+TOTAL : 1.152176 sec
+ 4,092,173,704 cycles:u # 3.486 GHz (74.86%)
+ 897,315 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.79%)
+ 383,423,485 stalled-cycles-backend:u # 9.37% backend cycles idle (74.83%)
+ 13,837,928,802 instructions:u # 3.38 insn per cycle
+ # 0.03 stalled cycles per insn (74.68%)
+ 1.177600284 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.847955e-03
-Avg ME (F77/C++) = 9.8479546894727158E-003
-Relative difference = 3.1532159158088894e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 9.847957e-03
+Avg ME (F77/C++) = 9.8479574833965355E-003
+Relative difference = 4.9085971470122835e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.027848e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.029604e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.029604e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.757002 sec
- 2,139,751,251 cycles # 2.812 GHz
- 4,826,850,049 instructions # 2.26 insn per cycle
- 0.761925975 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.033165e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.033435e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.033435e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6
+TOTAL : 0.512821 sec
+ 1,850,580,257 cycles:u # 3.460 GHz (74.68%)
+ 801,661 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.58%)
+ 171,065,874 stalled-cycles-backend:u # 9.24% backend cycles idle (74.58%)
+ 4,888,673,602 instructions:u # 2.64 insn per cycle
+ # 0.03 stalled cycles per insn (74.21%)
+ 0.538169231 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 9.892973e-03
-Avg ME (F77/C++) = 9.8929728159608508E-003
-Relative difference = 1.8603017364363385e-08
+Avg ME (F77/C++) = 9.8929728161012351E-003
+Relative difference = 1.8588827066662492e-08
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.742537e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.744656e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.744656e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.687158 sec
- 1,881,504,674 cycles # 2.723 GHz
- 4,259,525,697 instructions # 2.26 insn per cycle
- 0.691913370 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.892973e-03
-Avg ME (F77/C++) = 9.8929728159608508E-003
-Relative difference = 1.8603017364363385e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.237873e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.240227e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.240227e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6
-TOTAL : 0.736183 sec
- 1,355,444,012 cycles # 1.832 GHz
- 2,148,211,890 instructions # 1.58 insn per cycle
- 0.741161893 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.892980e-03
-Avg ME (F77/C++) = 9.8929802670331551E-003
-Relative difference = 2.699218597469717e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
index 566b5e74be..cf278eddf4 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
@@ -1,223 +1,185 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-02-02_16:46:58
+DATE: 2024-02-08_18:33:57
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.758470e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.759336e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.759689e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6
-TOTAL : 1.691893 sec
- 5,711,520,645 cycles # 2.950 GHz
- 11,441,558,901 instructions # 2.00 insn per cycle
- 2.040751530 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.868765e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.872239e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.872288e+02 ) sec^-1
+MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6
+TOTAL : 4.402395 sec
+ 13,271,493,404 cycles:u # 3.000 GHz (74.88%)
+ 2,886,684 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.90%)
+ 6,550,270 stalled-cycles-backend:u # 0.05% backend cycles idle (75.00%)
+ 10,885,613,237 instructions:u # 0.82 insn per cycle
+ # 0.00 stalled cycles per insn (75.06%)
+ 4.450270074 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.326643e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.327428e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.327533e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6
-TOTAL : 1.909157 sec
- 6,565,590,356 cycles # 3.001 GHz
- 12,834,571,414 instructions # 1.95 insn per cycle
- 2.244417429 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.376055e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.391546e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.391643e+03 ) sec^-1
+MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6
+TOTAL : 4.669395 sec
+ 14,084,174,259 cycles:u # 3.005 GHz (74.91%)
+ 2,801,474 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%)
+ 6,533,267 stalled-cycles-backend:u # 0.05% backend cycles idle (75.09%)
+ 11,524,020,381 instructions:u # 0.82 insn per cycle
+ # 0.00 stalled cycles per insn (75.10%)
+ 4.721310793 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 9.849636e-03
-Avg ME (F77/CUDA) = 9.8712405367667715E-003
-Relative difference = 0.0021934350433631634
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 9.855155e-03
+Avg ME (F77/CUDA) = 9.8696023209835834E-003
+Relative difference = 0.0014659658811639687
OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.959014e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.959293e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.959293e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 5.910184 sec
- 17,701,685,853 cycles # 2.995 GHz
- 51,758,718,959 instructions # 2.92 insn per cycle
- 5.917186981 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.903825e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.904092e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.904092e+01 ) sec^-1
+MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6
+TOTAL : 5.336670 sec
+ 16,857,784,212 cycles:u # 3.145 GHz (74.93%)
+ 18,781,556 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.94%)
+ 1,691,660,205 stalled-cycles-backend:u # 10.03% backend cycles idle (74.96%)
+ 51,778,005,120 instructions:u # 3.07 insn per cycle
+ # 0.03 stalled cycles per insn (75.03%)
+ 5.362534137 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.847961e-03
-Avg ME (F77/C++) = 9.8479612087313262E-003
-Relative difference = 2.1195385077844924e-08
+Avg ME (F77/C++) = 9.8479612087396841E-003
+Relative difference = 2.119623377106246e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.537525e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.538012e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.538012e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6
-TOTAL : 1.500942 sec
- 4,546,652,891 cycles # 3.026 GHz
- 13,758,231,878 instructions # 3.03 insn per cycle
- 1.512478440 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.115498e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.115986e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.115986e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6
+TOTAL : 1.284920 sec
+ 4,067,296,429 cycles:u # 3.110 GHz (75.00%)
+ 802,854 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%)
+ 403,620,597 stalled-cycles-backend:u # 9.92% backend cycles idle (74.93%)
+ 13,801,881,820 instructions:u # 3.39 insn per cycle
+ # 0.03 stalled cycles per insn (74.93%)
+ 1.311460175 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.847955e-03
-Avg ME (F77/C++) = 9.8479546894727158E-003
-Relative difference = 3.1532159158088894e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 9.847957e-03
+Avg ME (F77/C++) = 9.8479574833965355E-003
+Relative difference = 4.9085971470122835e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.087390e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.089242e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.089242e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.753840 sec
- 2,129,748,423 cycles # 2.823 GHz
- 4,826,582,246 instructions # 2.27 insn per cycle
- 0.766400763 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.892973e-03
-Avg ME (F77/C++) = 9.8929728159608508E-003
-Relative difference = 1.8603017364363385e-08
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 9.225770e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.228365e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.228365e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6
+TOTAL : 0.573839 sec
+ 1,834,916,505 cycles:u # 3.073 GHz (74.55%)
+ 663,425 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.55%)
+ 164,551,872 stalled-cycles-backend:u # 8.97% backend cycles idle (74.55%)
+ 4,875,367,626 instructions:u # 2.66 insn per cycle
+ # 0.03 stalled cycles per insn (74.79%)
+ 0.600651055 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84775) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.167354e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.169635e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.169635e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.655725 sec
- 1,855,990,861 cycles # 2.827 GHz
- 4,258,946,173 instructions # 2.29 insn per cycle
- 0.669691677 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.892973e-03
-Avg ME (F77/C++) = 9.8929728159608508E-003
-Relative difference = 1.8603017364363385e-08
+Avg ME (F77/C++) = 9.8929728161012351E-003
+Relative difference = 1.8588827066662492e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.317027e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.319302e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.319302e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6
-TOTAL : 0.730779 sec
- 1,353,984,643 cycles # 1.850 GHz
- 2,148,002,236 instructions # 1.59 insn per cycle
- 0.746272505 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.892980e-03
-Avg ME (F77/C++) = 9.8929802670331551E-003
-Relative difference = 2.699218597469717e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
index d5349f1044..fb9c2f0290 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
@@ -1,223 +1,185 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-02-02_16:47:46
+DATE: 2024-02-08_18:35:03
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.679807e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.680329e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.680553e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 2.208385 sec
- 7,483,149,124 cycles # 2.993 GHz
- 14,933,253,345 instructions # 2.00 insn per cycle
- 2.603387022 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.648038e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.652945e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.653039e+01 ) sec^-1
+MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6
+TOTAL : 9.695567 sec
+ 30,205,650,330 cycles:u # 3.109 GHz (74.93%)
+ 4,340,496 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%)
+ 6,895,406 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%)
+ 23,899,524,019 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (75.04%)
+ 9.745066259 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.111287e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.111605e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.111637e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.399866 sec
- 11,226,626,046 cycles # 3.010 GHz
- 25,016,425,895 instructions # 2.23 insn per cycle
- 3.786622150 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.332582e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.335476e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.335510e+03 ) sec^-1
+MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6
+TOTAL : 9.304279 sec
+ 28,962,713,553 cycles:u # 3.106 GHz (75.01%)
+ 3,467,787 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%)
+ 7,678,561 stalled-cycles-backend:u # 0.03% backend cycles idle (74.95%)
+ 23,027,565,140 instructions:u # 0.80 insn per cycle
+ # 0.00 stalled cycles per insn (74.98%)
+ 9.351434608 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 9.872263e-03
-Avg ME (F77/CUDA) = 9.8722599015656498E-003
-Relative difference = 3.1385249252060663e-07
+Avg ME (F77/CUDA) = 9.8722599015656533E-003
+Relative difference = 3.138524921691728e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.319185e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.319424e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.319424e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.355444 sec
- 19,249,020,805 cycles # 3.029 GHz
- 55,392,387,011 instructions # 2.88 insn per cycle
- 6.362842829 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.154821e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.155049e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.155049e+01 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 5.759679 sec
+ 18,254,580,621 cycles:u # 3.157 GHz (74.96%)
+ 35,463,525 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.96%)
+ 2,215,087,047 stalled-cycles-backend:u # 12.13% backend cycles idle (74.96%)
+ 55,428,774,623 instructions:u # 3.04 insn per cycle
+ # 0.04 stalled cycles per insn (74.97%)
+ 5.784252327 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595861831675E-003
 Relative difference = 3.457988134687711e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.591013e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.591102e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.591102e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 3.325639 sec
- 9,355,505,290 cycles # 2.813 GHz
- 25,875,854,886 instructions # 2.77 insn per cycle
- 3.338638053 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.107602e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.107739e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.107739e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 2.508053 sec
+ 7,947,357,636 cycles:u # 3.140 GHz (74.91%)
+ 1,968,487 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%)
+ 830,230,111 stalled-cycles-backend:u # 10.45% backend cycles idle (75.03%)
+ 25,895,965,513 instructions:u # 3.26 insn per cycle
+ # 0.03 stalled cycles per insn (75.03%)
+ 2.534723739 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722594844308162E-003
 Relative difference = 3.5610570575237004e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.676144e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.676607e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.676607e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.443779 sec
- 4,067,371,047 cycles # 2.814 GHz
- 9,120,300,183 instructions # 2.24 insn per cycle
- 1.456849058 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.972373e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.973056e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.973056e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 1.063681 sec
+ 3,417,815,869 cycles:u # 3.146 GHz (75.03%)
+ 1,146,039 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%)
+ 286,110,911 stalled-cycles-backend:u # 8.37% backend cycles idle (74.97%)
+ 9,138,730,343 instructions:u # 2.67 insn per cycle
+ # 0.03 stalled cycles per insn (74.97%)
+ 1.090363703 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83802) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722594324461913E-003
 Relative difference = 3.613714310412983e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.281811e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.282504e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.282504e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.240763 sec
- 3,512,198,674 cycles # 2.825 GHz
- 8,030,542,574 instructions # 2.29 insn per cycle
- 1.254980519 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722594324461913E-003
-Relative difference = 3.613714310412983e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.714777e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.715391e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.715391e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.430270 sec
- 2,598,676,401 cycles # 1.815 GHz
- 4,076,110,376 instructions # 1.57 insn per cycle
- 1.446540030 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722594324461913E-003
-Relative difference = 3.613714310412983e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
index 0ad62a3205..22fe4f03b0 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
@@ -1,223 +1,185 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-02-02_16:48:48
+DATE: 2024-02-08_18:36:49
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.682143e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.682672e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.682898e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 2.179054 sec
- 7,466,719,755 cycles # 2.985 GHz
- 15,111,429,544 instructions # 2.02 insn per cycle
- 2.563145571 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.616335e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.621581e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.621650e+01 ) sec^-1
+MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6
+TOTAL : 9.699058 sec
+ 29,598,806,729 cycles:u # 3.045 GHz (74.95%)
+ 3,576,423 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%)
+ 7,747,683 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%)
+ 23,405,537,654 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (75.04%)
+ 9.749031776 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.103332e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.103646e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.103680e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.418538 sec
- 11,241,539,857 cycles # 3.001 GHz
- 25,160,908,754 instructions # 2.24 insn per cycle
- 3.802025666 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.344020e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.347210e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.347244e+03 ) sec^-1
+MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6
+TOTAL : 9.276482 sec
+ 28,308,931,845 cycles:u # 3.045 GHz (74.90%)
+ 3,414,978 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%)
+ 7,329,285 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%)
+ 22,421,621,422 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (75.04%)
+ 9.323740295 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 9.872263e-03
-Avg ME (F77/CUDA) = 9.8722599015656498E-003
-Relative difference = 3.1385249252060663e-07
+Avg ME (F77/CUDA) = 9.8722599015656533E-003
+Relative difference = 3.138524921691728e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.015998e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.016214e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.016214e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.598703 sec
- 19,223,507,563 cycles # 2.912 GHz
- 55,419,755,010 instructions # 2.88 insn per cycle
- 6.603954215 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 8.988779e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.989048e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.989048e+01 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 5.875618 sec
+ 18,246,421,245 cycles:u # 3.093 GHz (74.92%)
+ 26,708,145 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.93%)
+ 2,188,827,775 stalled-cycles-backend:u # 12.00% backend cycles idle (75.00%)
+ 55,417,603,916 instructions:u # 3.04 insn per cycle
+ # 0.04 stalled cycles per insn (75.05%)
+ 5.901493860 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595861831675E-003
 Relative difference = 3.457988134687711e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.598145e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.598247e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.598247e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 3.309125 sec
- 9,318,345,372 cycles # 2.812 GHz
- 25,822,753,657 instructions # 2.77 insn per cycle
- 3.319044879 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.109158e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.109282e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.109282e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 2.506347 sec
+ 7,931,256,027 cycles:u # 3.136 GHz (74.96%)
+ 1,981,078 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%)
+ 850,969,438 stalled-cycles-backend:u # 10.73% backend cycles idle (75.02%)
+ 25,831,043,266 instructions:u # 3.26 insn per cycle
+ # 0.03 stalled cycles per insn (75.02%)
+ 2.533258870 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722594844308162E-003
 Relative difference = 3.5610570575237004e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.742489e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.743003e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.743003e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.416733 sec
- 4,002,433,005 cycles # 2.817 GHz
- 9,099,583,492 instructions # 2.27 insn per cycle
- 1.430189946 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.969298e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.969935e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.969935e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 1.060730 sec
+ 3,390,200,214 cycles:u # 3.131 GHz (74.89%)
+ 683,003 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.89%)
+ 278,832,981 stalled-cycles-backend:u # 8.22% backend cycles idle (74.89%)
+ 9,099,432,389 instructions:u # 2.68 insn per cycle
+ # 0.03 stalled cycles per insn (74.92%)
+ 1.086316685 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83360) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722594324461913E-003
 Relative difference = 3.613714310412983e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.307718e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.308353e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.308353e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.231375 sec
- 3,483,426,257 cycles # 2.819 GHz
- 8,010,048,340 instructions # 2.30 insn per cycle
- 1.242674618 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722594324461913E-003
-Relative difference = 3.613714310412983e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.744723e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.745346e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.745346e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.417295 sec
- 2,597,234,439 cycles # 1.827 GHz
- 4,065,757,144 instructions # 1.57 insn per cycle
- 1.427614764 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722594324461913E-003
-Relative difference = 3.613714310412983e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
index 709aec40c9..0b3887f883 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
@@ -1,223 +1,110 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-02-02_16:42:15
+DATE: 2024-02-08_18:27:49
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.737515e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.317994e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.705494e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.447758 sec
- 1,947,019,924 cycles # 2.936 GHz
- 2,713,730,929 instructions # 1.39 insn per cycle
- 0.737714468 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault
+ 678,458,595 cycles:u # 1.950 GHz (76.80%)
+ 2,096,593 stalled-cycles-frontend:u # 0.31% frontend cycles idle (75.56%)
+ 4,777,999 stalled-cycles-backend:u # 0.70% backend cycles idle (74.58%)
+ 1,192,050,273 instructions:u # 1.76 insn per cycle
+ # 0.00 stalled cycles per insn (75.86%)
+ 0.419347421 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.224477e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.099697e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.509261e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.534455 sec
- 2,254,808,747 cycles # 2.915 GHz
- 3,204,461,579 instructions # 1.42 insn per cycle
- 0.831966429 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault
+ 914,884,219 cycles:u # 1.986 GHz (73.94%)
+ 2,206,448 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.90%)
+ 4,965,762 stalled-cycles-backend:u # 0.54% backend cycles idle (75.65%)
+ 1,456,857,476 instructions:u # 1.59 insn per cycle
+ # 0.00 stalled cycles per insn (72.57%)
+ 0.486671081 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424749e-01
-Avg ME (F77/CUDA) = 0.14247482467490466
-Relative difference = 5.286902838873106e-07
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6939f20) on address 0x1543b4ed9000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x15464a241dbf in ???
+#1 0x15464a241d2b in ???
+#2 0x15464a2433e4 in ???
+#3 0x154642714b64 in ???
+#4 0x154642711b38 in ???
+#5 0x1546426cf496 in ???
+#6 0x15464a1db6e9 in ???
+#7 0x15464a30f49e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.025993e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.047135e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.047135e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.620701 sec
- 4,885,240,850 cycles # 3.007 GHz
- 13,801,054,581 instructions # 2.83 insn per cycle
- 1.627927609 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.049511e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.067308e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.067308e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
+TOTAL : 1.587472 sec
+ 4,990,333,752 cycles:u # 3.099 GHz (74.56%)
+ 2,680,458 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.80%)
+ 666,047,617 stalled-cycles-backend:u # 13.35% backend cycles idle (75.17%)
+ 13,821,217,940 instructions:u # 2.77 insn per cycle
+ # 0.05 stalled cycles per insn (75.17%)
+ 1.612934376 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467499481
-Relative difference = 5.286896511435107e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.858881e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.930488e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.930488e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.904382 sec
- 2,569,767,340 cycles # 2.848 GHz
- 7,403,958,208 instructions # 2.88 insn per cycle
- 0.919313186 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467499475
-Relative difference = 5.286896515331313e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.327926e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.549781e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.549781e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.514310 sec
- 1,471,568,209 cycles # 2.835 GHz
- 3,136,644,690 instructions # 2.13 insn per cycle
- 0.524015486 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.737499e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.014382e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.014382e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.461069 sec
- 1,312,829,416 cycles # 2.819 GHz
- 2,923,462,557 instructions # 2.23 insn per cycle
- 0.474824775 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.608540e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.741116e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.741116e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.652881 sec
- 1,267,079,702 cycles # 1.927 GHz
- 1,899,986,624 instructions # 1.50 insn per cycle
- 0.665206091 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x63f5c0) on address 0x1482ae5e9000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
index aaaacca6e6..0df8842357 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
@@ -1,240 +1,119 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-02-02_17:12:38
+DATE: 2024-02-08_19:10:40
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.531290e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.124049e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.124049e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.474980 sec
- 2,003,149,486 cycles # 2.926 GHz
- 3,004,961,830 instructions # 1.50 insn per cycle
- 0.743885564 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
 WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault
+ 728,182,078 cycles:u # 2.092 GHz (75.39%)
+ 2,800,235 stalled-cycles-frontend:u # 0.38% frontend cycles idle (76.02%)
+ 39,838,991 stalled-cycles-backend:u # 5.47% backend cycles idle (74.83%)
+ 1,290,090,050 instructions:u # 1.77 insn per cycle
+ # 0.03 stalled cycles per insn (73.58%)
+ 0.375238581 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.225933e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.275329e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.275329e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.754625 sec
- 2,972,407,783 cycles # 2.951 GHz
- 4,484,386,384 instructions # 1.51 insn per cycle
- 1.064565206 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault
+ 2,975,558,509 cycles:u # 2.720 GHz (75.17%)
+ 30,279,059 stalled-cycles-frontend:u # 1.02% frontend cycles idle (75.00%)
+ 868,671,193 stalled-cycles-backend:u # 29.19% backend cycles idle (73.95%)
+ 3,189,244,807 instructions:u # 1.07 insn per cycle
+ # 0.27 stalled cycles per insn (74.42%)
+ 1.123357973 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424749e-01
-Avg ME (F77/CUDA) = 0.14247482467490466
-Relative difference = 5.286902838873106e-07
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6939f20) on address 0x15053b5e9000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x1507d095cdbf in ???
+#1 0x1507d095cd2b in ???
+#2 0x1507d095e3e4 in ???
+#3 0x1507c8e2fb64 in ???
+#4 0x1507c8e2cb38 in ???
+#5 0x1507c8dea496 in ???
+#6 0x1507d08f66e9 in ???
+#7 0x1507d0a2a49e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.014783e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.035797e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.035797e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.643075 sec
- 4,911,861,343 cycles # 2.982 GHz
- 13,807,456,119 instructions # 2.81 insn per cycle
- 1.648250593 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.178600e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.198668e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.198668e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
+TOTAL : 1.417813 sec
+ 4,995,418,214 cycles:u # 3.468 GHz (75.01%)
+ 2,349,825 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%)
+ 664,541,068 stalled-cycles-backend:u # 13.30% backend cycles idle (75.01%)
+ 13,818,530,399 instructions:u # 2.77 insn per cycle
+ # 0.05 stalled cycles per insn (75.02%)
+ 1.442715547 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467499481
-Relative difference = 5.286896511435107e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.961945e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.039242e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.039242e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.865006 sec
- 2,599,747,622 cycles # 2.992 GHz
- 7,450,144,235 instructions # 2.87 insn per cycle
- 0.870288766 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467499475
-Relative difference = 5.286896515331313e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.265546e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.483151e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.483151e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.530641 sec
- 1,507,129,758 cycles # 2.818 GHz
- 3,185,041,285 instructions # 2.11 insn per cycle
- 0.535552961 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.729343e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.012903e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.012903e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.470021 sec
- 1,347,536,746 cycles # 2.841 GHz
- 2,973,609,171 instructions # 2.21 insn per cycle
- 0.475386293 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.570458e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.702972e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.702972e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.668768 sec
- 1,302,636,181 cycles # 1.936 GHz
- 1,938,985,717 instructions # 1.49 insn per cycle
- 0.673942862 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x63f5c0) on address 0x147118409000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
index def3dbba1c..097c028fef 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
@@ -1,223 +1,110 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-02-02_16:42:33
+DATE: 2024-02-08_18:28:05
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.645418e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.159475e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.502239e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.450064 sec
- 1,945,919,554 cycles # 2.929 GHz
- 2,740,533,091 instructions # 1.41 insn per cycle
- 0.737109478 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault
+ 698,865,952 cycles:u # 1.944 GHz (73.43%)
+ 2,244,464 stalled-cycles-frontend:u # 0.32% frontend cycles idle (72.12%)
+ 4,596,251 stalled-cycles-backend:u # 0.66% backend cycles idle (74.71%)
+ 1,218,556,671 instructions:u # 1.74 insn per cycle
+ # 0.00 stalled cycles per insn (76.73%)
+ 0.390528278 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.239312e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.034785e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.418065e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.532153 sec
- 2,256,105,049 cycles # 2.926 GHz
- 3,218,876,956 instructions # 1.43 insn per cycle
- 0.828964993 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault
+ 888,738,105 cycles:u # 1.956 GHz (74.02%)
+ 2,149,629 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.72%)
+ 4,210,700 stalled-cycles-backend:u # 0.47% backend cycles idle (75.23%)
+ 1,445,144,664 instructions:u # 1.63 insn per cycle
+ # 0.00 stalled cycles per insn (73.05%)
+ 0.480331343 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424749e-01
-Avg ME (F77/CUDA) = 0.14247482467490466
-Relative difference = 5.286902838873106e-07
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6939e70) on address 0x149b09bf9000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x149d9ef6bdbf in ???
+#1 0x149d9ef6bd2b in ???
+#2 0x149d9ef6d3e4 in ???
+#3 0x149d9743eb64 in ???
+#4 0x149d9743bb38 in ???
+#5 0x149d973f9496 in ???
+#6 0x149d9ef056e9 in ???
+#7 0x149d9f03949e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.029808e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.050747e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.050747e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.612824 sec
- 4,877,222,352 cycles # 3.017 GHz
- 13,807,484,460 instructions # 2.83 insn per cycle
- 1.619569782 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.053182e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.071027e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.071027e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
+TOTAL : 1.581312 sec
+ 4,982,835,902 cycles:u # 3.107 GHz (74.93%)
+ 2,582,687 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.06%)
+ 861,677,452 stalled-cycles-backend:u # 17.29% backend cycles idle (75.06%)
+ 13,811,263,039 instructions:u # 2.77 insn per cycle
+ # 0.06 stalled cycles per insn (75.07%)
+ 1.606148924 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467499481
-Relative difference = 5.286896511435107e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.992874e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.070792e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.070792e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.843930 sec
- 2,562,987,418 cycles # 3.020 GHz
- 7,406,975,220 instructions # 2.89 insn per cycle
- 0.861130265 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467499475
-Relative difference = 5.286896515331313e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.295521e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.508640e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.508640e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.519294 sec
- 1,478,874,618 cycles # 2.823 GHz
- 3,137,249,390 instructions # 2.12 insn per cycle
- 0.531146181 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.750339e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.036614e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.036614e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.459538 sec
- 1,308,250,750 cycles # 2.817 GHz
- 2,925,257,009 instructions # 2.24 insn per cycle
- 0.474322768 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.573153e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.702226e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.702226e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.661344 sec
- 1,266,430,388 cycles # 1.901 GHz
- 1,899,823,871 instructions # 1.50 insn per cycle
- 0.676726345 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x63e260) on address 0x14a85cf29000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
index c860776fa0..a0256eaec7 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
@@ -1,223 +1,110 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'

-DATE: 2024-02-02_16:42:51
+DATE: 2024-02-08_18:28:20

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.341758e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.190658e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.328439e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2
-TOTAL : 0.445447 sec
- 1,958,931,075 cycles # 2.911 GHz
- 2,740,620,768 instructions # 1.40 insn per cycle
- 0.747924247 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 167
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault
+ 730,948,845 cycles:u # 2.022 GHz (70.93%)
+ 2,207,273 stalled-cycles-frontend:u # 0.30% frontend cycles idle (73.75%)
+ 4,601,417 stalled-cycles-backend:u # 0.63% backend cycles idle (76.40%)
+ 1,234,849,282 instructions:u # 1.69 insn per cycle
+ # 0.00 stalled cycles per insn (75.58%)
+ 0.392194629 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.248405e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.807223e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.955827e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2
-TOTAL : 0.479084 sec
- 2,074,042,546 cycles # 2.938 GHz
- 2,942,698,737 instructions # 1.42 insn per cycle
- 0.764006157 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault
+ 776,560,467 cycles:u # 1.875 GHz (75.08%)
+ 2,136,178 stalled-cycles-frontend:u # 0.28% frontend cycles idle (76.59%)
+ 4,919,267 stalled-cycles-backend:u # 0.63% backend cycles idle (74.22%)
+ 1,354,534,215 instructions:u # 1.74 insn per cycle
+ # 0.00 stalled cycles per insn (71.20%)
+ 0.442976193 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424226e-01
-Avg ME (F77/CUDA) = 0.14247488790821983
-Relative difference = 0.00036713209996037764
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6937f40) on address 0x1469473b4000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x146bdc71cdbf in ???
+#1 0x146bdc71cd2b in ???
+#2 0x146bdc71e3e4 in ???
+#3 0x146bd4befb64 in ???
+#4 0x146bd4becb38 in ???
+#5 0x146bd4baa496 in ???
+#6 0x146bdc6b66e9 in ???
+#7 0x146bdc7ea49e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.160350e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.187463e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.187463e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 1.433449 sec
- 4,340,603,477 cycles # 3.021 GHz
- 12,596,481,304 instructions # 2.90 insn per cycle
- 1.440368945 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.289085e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.315926e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.315926e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2
+TOTAL : 1.293913 sec
+ 4,128,663,439 cycles:u # 3.138 GHz (74.91%)
+ 2,492,915 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.08%)
+ 255,992,571 stalled-cycles-backend:u # 6.20% backend cycles idle (75.08%)
+ 12,617,506,411 instructions:u # 3.06 insn per cycle
+ # 0.02 stalled cycles per insn (75.08%)
+ 1.318022009 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860569653919
-Relative difference = 3.998452420257791e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.161166e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.375054e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.375054e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 0.539052 sec
- 1,593,462,745 cycles # 2.934 GHz
- 4,246,550,820 instructions # 2.66 insn per cycle
- 0.550930699 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860808920836
-Relative difference = 5.677888572434963e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.534286e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.217104e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.217104e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.317649 sec
- 849,618,352 cycles # 2.636 GHz
- 1,915,840,127 instructions # 2.25 insn per cycle
- 0.330429505 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
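Note: the `Memory access fault by GPU node-4 ... Reason: Unknown.` aborts above come from the ROCm/HSA runtime and carry no symbol information, which is why the accompanying Fortran backtraces are all `???` frames. One generic way to produce exactly this failure mode (a hypothetical illustration, not a confirmed diagnosis of these crashes) is handing a device-side consumer a host pointer, as sketched below.

```cpp
// Hypothetical HIP/hipRAND sketch of a host/device pointer mix-up that typically
// ends in "Memory access fault by GPU node-N ... Reason: Unknown" at runtime.
// Illustration only: not the plugin's code and not a confirmed diagnosis.
#include <hip/hip_runtime.h>
#include <hiprand/hiprand.h>
#include <vector>

int main()
{
  hiprandGenerator_t gen;
  hiprandCreateGenerator( &gen, HIPRAND_RNG_PSEUDO_DEFAULT ); // device-side generator
  std::vector<float> hbuf( 1 << 20 );
  // BUG (kept commented out): a device generator writing through a pageable host
  // pointer can trigger a GPU memory access fault instead of a clean error code:
  // hiprandGenerateUniform( gen, hbuf.data(), hbuf.size() );
  // Correct pattern: generate into device memory, then copy back explicitly.
  float* dbuf = nullptr;
  hipMalloc( &dbuf, hbuf.size() * sizeof( float ) );
  hiprandGenerateUniform( gen, dbuf, hbuf.size() );
  hipMemcpy( hbuf.data(), dbuf, hbuf.size() * sizeof( float ), hipMemcpyDeviceToHost );
  hipFree( dbuf );
  hiprandDestroyGenerator( gen );
  return 0;
}
```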
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.600909e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.536655e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.536655e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.268659 sec
- 778,768,969 cycles # 2.850 GHz
- 1,797,759,612 instructions # 2.31 insn per cycle
- 0.282543754 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.889932e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.403710e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.403710e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.357871 sec
- 719,128,388 cycles # 1.985 GHz
- 1,287,763,066 instructions # 1.79 insn per cycle
- 0.369697308 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490450137867
-Relative difference = 3.159418737238044e-08
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x61d1a0) on address 0x151cb5004000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
index df565fa72a..44356807a0 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
@@ -1,240 +1,119 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'

-DATE: 2024-02-02_17:12:56
+DATE: 2024-02-08_19:10:55

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.620430e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.101475e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.101475e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2
-TOTAL : 0.453053 sec
- 1,943,434,058 cycles # 2.927 GHz
- 2,835,452,734 instructions # 1.46 insn per cycle
- 0.721678143 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
 WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 167
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault
+ 726,549,981 cycles:u # 2.114 GHz (74.42%)
+ 2,764,701 stalled-cycles-frontend:u # 0.38% frontend cycles idle (75.23%)
+ 39,282,855 stalled-cycles-backend:u # 5.41% backend cycles idle (75.39%)
+ 1,223,427,295 instructions:u # 1.68 insn per cycle
+ # 0.03 stalled cycles per insn (76.73%)
+ 0.370654317 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.153539e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.611291e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.611291e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2
-TOTAL : 0.625966 sec
- 2,492,383,641 cycles # 2.900 GHz
- 3,795,560,098 instructions # 1.52 insn per cycle
- 0.916519854 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault
+ 2,939,481,812 cycles:u # 2.907 GHz (73.26%)
+ 30,604,267 stalled-cycles-frontend:u # 1.04% frontend cycles idle (74.74%)
+ 845,721,220 stalled-cycles-backend:u # 28.77% backend cycles idle (76.20%)
+ 3,069,069,064 instructions:u # 1.04 insn per cycle
+ # 0.28 stalled cycles per insn (75.74%)
+ 1.037907236 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424226e-01
-Avg ME (F77/CUDA) = 0.14247488790821983
-Relative difference = 0.00036713209996037764
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6937f40) on address 0x1548559fc000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x154aead64dbf in ???
+#1 0x154aead64d2b in ???
+#2 0x154aead663e4 in ???
+#3 0x154ae3237b64 in ???
+#4 0x154ae3234b38 in ???
+#5 0x154ae31f2496 in ???
+#6 0x154aeacfe6e9 in ???
+#7 0x154aeae3249e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.156604e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.183518e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.183518e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 1.440661 sec
- 4,354,684,491 cycles # 3.015 GHz
- 12,600,636,870 instructions # 2.89 insn per cycle
- 1.445611635 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.402501e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.431590e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.431590e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2
+TOTAL : 1.191704 sec
+ 4,212,711,584 cycles:u # 3.470 GHz (74.96%)
+ 2,486,175 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.96%)
+ 287,047,288 stalled-cycles-backend:u # 6.81% backend cycles idle (74.96%)
+ 12,623,625,232 instructions:u # 3.00 insn per cycle
+ # 0.02 stalled cycles per insn (74.97%)
+ 1.216006959 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860569653919
-Relative difference = 3.998452420257791e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.245594e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.466904e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.466904e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 0.529740 sec
- 1,611,855,456 cycles # 3.018 GHz
- 4,293,644,343 instructions # 2.66 insn per cycle
- 0.534967394 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860808920836
-Relative difference = 5.677888572434963e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.930980e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.676464e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.676464e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.300611 sec
- 867,796,228 cycles # 2.849 GHz
- 1,951,592,917 instructions # 2.25 insn per cycle
- 0.305494247 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.454005e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.360174e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.360174e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.278571 sec
- 797,194,918 cycles # 2.821 GHz
- 1,833,850,563 instructions # 2.30 insn per cycle
- 0.283590989 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.869819e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.364451e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.364451e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.363356 sec
- 737,483,077 cycles # 2.007 GHz
- 1,329,006,524 instructions # 1.80 insn per cycle
- 0.368344130 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
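Note: the WARNING cascade in the bridge logs above encodes a fixed fallback chain: `--bridge` forces RamboHost, RamboHost cannot consume device-resident random numbers, and with no host-side hipRAND implementation the tests fall back to CommonRandom. A minimal sketch of that selection logic is shown below (hypothetical helper names, not the plugin's API).

```cpp
// Minimal sketch of the fallback chain described by the WARNINGs above.
// Hypothetical helper, not the plugin's API.
enum class RngChoice { HiprandDevice, HiprandHost, CommonRandom };

RngChoice chooseRng( bool useBridge, bool hiprandHostImplemented )
{
  const bool ramboOnHost = useBridge; // "Bridge selected: cannot use RamboDevice, will use RamboHost"
  if( !ramboOnHost ) return RngChoice::HiprandDevice;
  // "RamboHost selected: cannot use HiprandDevice, will use CommonRandom
  //  (as HiprandHost is not implemented yet)"
  return hiprandHostImplemented ? RngChoice::HiprandHost : RngChoice::CommonRandom;
}
```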
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490450137867
-Relative difference = 3.159418737238044e-08
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x61d1a0) on address 0x14f8c1b54000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
index 8e77565e09..3dfd6ebf2e 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
@@ -1,223 +1,110 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'

-DATE: 2024-02-02_16:43:08
+DATE: 2024-02-08_18:28:35

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.351912e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.207345e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.346653e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2
-TOTAL : 0.441492 sec
- 1,928,765,051 cycles # 2.934 GHz
- 2,724,267,861 instructions # 1.41 insn per cycle
- 0.734317632 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 167
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault
+ 688,551,808 cycles:u # 1.933 GHz (74.89%)
+ 2,213,305 stalled-cycles-frontend:u # 0.32% frontend cycles idle (75.58%)
+ 4,609,268 stalled-cycles-backend:u # 0.67% backend cycles idle (77.47%)
+ 1,197,231,249 instructions:u # 1.74 insn per cycle
+ # 0.00 stalled cycles per insn (76.19%)
+ 0.387263140 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.196647e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.772987e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.913733e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2
-TOTAL : 0.482632 sec
- 2,080,113,586 cycles # 2.927 GHz
- 2,943,676,679 instructions # 1.42 insn per cycle
- 0.769455269 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault
+ 780,842,440 cycles:u # 1.930 GHz (74.65%)
+ 2,040,560 stalled-cycles-frontend:u # 0.26% frontend cycles idle (76.81%)
+ 4,988,745 stalled-cycles-backend:u # 0.64% backend cycles idle (75.02%)
+ 1,352,770,220 instructions:u # 1.73 insn per cycle
+ # 0.00 stalled cycles per insn (72.27%)
+ 0.432239511 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424226e-01
-Avg ME (F77/CUDA) = 0.14247488790821983
-Relative difference = 0.00036713209996037764
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6937e90) on address 0x14ae22ff4000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x14b0b835ddbf in ???
+#1 0x14b0b835dd2b in ???
+#2 0x14b0b835f3e4 in ???
+#3 0x14b0b0830b64 in ???
+#4 0x14b0b082db38 in ???
+#5 0x14b0b07eb496 in ???
+#6 0x14b0b82f76e9 in ???
+#7 0x14b0b842b49e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.152822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.180185e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.180185e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 1.442053 sec
- 4,373,449,247 cycles # 3.025 GHz
- 12,588,405,825 instructions # 2.88 insn per cycle
- 1.448913829 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.286257e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.313322e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.313322e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2
+TOTAL : 1.296760 sec
+ 4,140,961,015 cycles:u # 3.138 GHz (74.86%)
+ 2,651,490 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.12%)
+ 520,725,746 stalled-cycles-backend:u # 12.57% backend cycles idle (75.15%)
+ 12,613,422,158 instructions:u # 3.05 insn per cycle
+ # 0.04 stalled cycles per insn (75.16%)
+ 1.321948469 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860569653919
-Relative difference = 3.998452420257791e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.271379e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.494324e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.494324e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 0.520653 sec
- 1,583,615,731 cycles # 3.015 GHz
- 4,241,146,713 instructions # 2.68 insn per cycle
- 0.538714337 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860808920836
-Relative difference = 5.677888572434963e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.006848e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.775665e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.775665e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.293009 sec
- 845,477,463 cycles # 2.841 GHz
- 1,913,866,507 instructions # 2.26 insn per cycle
- 0.308184751 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.569512e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.506062e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.506062e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.270089 sec
- 778,113,704 cycles # 2.834 GHz
- 1,795,656,010 instructions # 2.31 insn per cycle
- 0.281692090 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.863632e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.377276e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.377276e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.359345 sec
- 716,962,783 cycles # 1.971 GHz
- 1,286,354,964 instructions # 1.79 insn per cycle
- 0.373120866 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490450137867
-Relative difference = 3.159418737238044e-08
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x61be40) on address 0x154d7a3a4000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index 302426324d..6f991a88e2 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,223 +1,110 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-02-02_16:43:25
+DATE: 2024-02-08_18:28:49
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.682481e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.333814e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.712009e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.450859 sec
- 1,930,805,021 cycles # 2.902 GHz
- 2,743,205,972 instructions # 1.42 insn per cycle
- 0.739904861 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault
+ 678,448,663 cycles:u # 1.984 GHz (75.54%)
+ 2,155,466 stalled-cycles-frontend:u # 0.32% frontend cycles idle (74.24%)
+ 5,437,136 stalled-cycles-backend:u # 0.80% backend cycles idle (72.01%)
+ 1,241,016,939 instructions:u # 1.83 insn per cycle
+ # 0.00 stalled cycles per insn (75.01%)
+ 0.390711844 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.318863e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.109707e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.540516e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.538586 sec
- 2,294,071,107 cycles # 2.920 GHz
- 3,206,997,510 instructions # 1.40 insn per cycle
- 0.845861920 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault
+ 922,080,460 cycles:u # 1.981 GHz (71.78%)
+ 2,206,869 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.43%)
+ 5,156,776 stalled-cycles-backend:u # 0.56% backend cycles idle (76.67%)
+ 1,354,584,330 instructions:u # 1.47 insn per cycle
+ # 0.00 stalled cycles per insn (76.01%)
+ 0.492294582 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424749e-01
-Avg ME (F77/CUDA) = 0.14247482577104625
-Relative difference = 5.209967070245855e-07
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6939f20) on address 0x14c931589000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x14cbc68eddbf in ???
+#1 0x14cbc68edd2b in ???
+#2 0x14cbc68ef3e4 in ???
+#3 0x14cbbedc0b64 in ???
+#4 0x14cbbedbdb38 in ???
+#5 0x14cbbed7b496 in ???
+#6 0x14cbc68876e9 in ???
+#7 0x14cbc69bb49e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.030561e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.051396e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.051396e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.612492 sec
- 4,891,974,404 cycles # 3.027 GHz
- 13,824,083,542 instructions # 2.83 insn per cycle
- 1.619343361 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.063338e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.081332e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.081332e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
+TOTAL : 1.566665 sec
+ 5,004,100,594 cycles:u # 3.148 GHz (74.84%)
+ 1,445,039 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.84%)
+ 839,191,252 stalled-cycles-backend:u # 16.77% backend cycles idle (74.85%)
+ 13,856,679,231 instructions:u # 2.77 insn per cycle
+ # 0.06 stalled cycles per insn (74.92%)
+ 1.592188452 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.889747e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.962130e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.962130e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.889923 sec
- 2,600,006,474 cycles # 2.906 GHz
- 7,349,466,762 instructions # 2.83 insn per cycle
- 0.902805426 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.317788e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.529668e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.529668e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.516136 sec
- 1,467,874,255 cycles # 2.820 GHz
- 3,084,471,228 instructions # 2.10 insn per cycle
- 0.534496669 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.845086e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.143678e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.143678e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.448906 sec
- 1,280,119,136 cycles # 2.821 GHz
- 2,872,961,625 instructions # 2.24 insn per cycle
- 0.463382466 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.518553e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.643051e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.643051e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.674731 sec
- 1,305,558,570 cycles # 1.923 GHz
- 1,914,923,523 instructions # 1.47 insn per cycle
- 0.686591057 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x63f5c0) on address 0x146cb9099000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
index 6e14be4837..987ac60c0e 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
@@ -1,223 +1,110 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+HASCURAND=hasNoCurand
+HASHIPRAND=hasHiprand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-02-02_16:43:43
+DATE: 2024-02-08_18:29:05
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.701801e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.169576e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.520195e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.444400 sec
- 1,962,279,911 cycles # 2.931 GHz
- 2,733,284,017 instructions # 1.39 insn per cycle
- 0.743203099 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault
+ 726,128,967 cycles:u # 2.120 GHz (73.37%)
+ 2,161,283 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.05%)
+ 4,554,612 stalled-cycles-backend:u # 0.63% backend cycles idle (77.69%)
+ 1,200,673,004 instructions:u # 1.65 insn per cycle
+ # 0.00 stalled cycles per insn (77.69%)
+ 0.369447796 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.269686e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.952538e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.363390e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.531967 sec
- 2,252,628,072 cycles # 2.922 GHz
- 3,236,812,236 instructions # 1.44 insn per cycle
- 0.829042027 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault
+ 901,851,871 cycles:u # 1.964 GHz (73.55%)
+ 2,143,838 stalled-cycles-frontend:u # 0.24% frontend cycles idle (76.11%)
+ 5,320,407 stalled-cycles-backend:u # 0.59% backend cycles idle (75.63%)
+ 1,381,245,349 instructions:u # 1.53 insn per cycle
+ # 0.00 stalled cycles per insn (75.73%)
+ 0.484540471 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424749e-01
-Avg ME (F77/CUDA) = 0.14247482577104625
-Relative difference = 5.209967070245855e-07
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6939e70) on address 0x1502ead09000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x150580071dbf in ???
+#1 0x150580071d2b in ???
+#2 0x1505800733e4 in ???
+#3 0x150578544b64 in ???
+#4 0x150578541b38 in ???
+#5 0x1505784ff496 in ???
+#6 0x15058000b6e9 in ???
+#7 0x15058013f49e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.021225e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.042103e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.042103e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.626571 sec
- 4,899,542,236 cycles # 3.005 GHz
- 13,831,314,326 instructions # 2.82 insn per cycle
- 1.633827936 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.065407e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.083513e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.083513e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
+TOTAL : 1.563492 sec
+ 5,002,207,106 cycles:u # 3.151 GHz (74.80%)
+ 1,731,248 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.80%)
+ 789,645,029 stalled-cycles-backend:u # 15.79% backend cycles idle (74.66%)
+ 13,861,853,774 instructions:u # 2.77 insn per cycle
+ # 0.06 stalled cycles per insn (74.92%)
+ 1.589695330 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.963291e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.037994e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.037994e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.856000 sec
- 2,600,446,163 cycles # 3.022 GHz
- 7,352,465,788 instructions # 2.83 insn per cycle
- 0.871835009 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.337785e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.557829e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.557829e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.512826 sec
- 1,467,845,165 cycles # 2.836 GHz
- 3,084,796,320 instructions # 2.10 insn per cycle
- 0.524269788 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.856557e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.152632e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.152632e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.446631 sec
- 1,279,278,871 cycles # 2.835 GHz
- 2,875,133,604 instructions # 2.25 insn per cycle
- 0.462171075 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.516538e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.638772e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.638772e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.675569 sec
- 1,303,481,113 cycles # 1.916 GHz
- 1,915,126,954 instructions # 1.47 insn per cycle
- 0.689456593 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x63e260) on address 0x1546044f9000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh
index 503d060237..efb282fc58 100755
--- a/epochX/cudacpp/tput/throughputX.sh
+++ b/epochX/cudacpp/tput/throughputX.sh
@@ -187,7 +187,7 @@ while [ "$1" != "" ]; do
  elif [ "$1" == "-curhst" ]; then
    rndgen=" -${1}"
    shift
- elif [ "$1" == "-rorhst" ]; then
+ elif [ "$1" == "-hirhst" ]; then
    rndgen=" -${1}"
    shift
  elif [ "$1" == "-rmbhst" ]; then
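
Note on the -hirhst rename above: it introduces a hiprand host-mode option that the updated logs still report as unavailable ("will use CommonRandom (as HiprandHost is not implemented yet)"). For orientation only, a minimal sketch of what a host-side hipRAND path could look like, assuming hipRAND's host generator API (hiprandCreateGeneratorHost) mirrors its curand counterpart; the header path may vary by ROCm install, and fillHiprandHost is an illustrative name, not the plugin's actual API:

#include <hiprand/hiprand.h>
#include <cassert>
#include <vector>

// Sketch only: fill a host buffer with uniform doubles using a hipRAND
// generator that runs on the CPU and writes directly to host memory.
inline void fillHiprandHost( std::vector<double>& rnd, unsigned long long seed )
{
  hiprandGenerator_t gen;
  hiprandStatus_t st = hiprandCreateGeneratorHost( &gen, HIPRAND_RNG_PSEUDO_DEFAULT );
  assert( st == HIPRAND_STATUS_SUCCESS );
  st = hiprandSetPseudoRandomGeneratorSeed( gen, seed );
  assert( st == HIPRAND_STATUS_SUCCESS );
  st = hiprandGenerateUniformDouble( gen, rnd.data(), rnd.size() ); // uniforms in (0,1]
  assert( st == HIPRAND_STATUS_SUCCESS );
  st = hiprandDestroyGenerator( gen );
  assert( st == HIPRAND_STATUS_SUCCESS );
  (void)st; // silence unused-variable warning in NDEBUG builds
}

Under these assumptions, such a host-side buffer could feed RamboHost directly, which would remove the CommonRandom fallback seen in the CPP runs of the logs above.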