diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 56d4950288..0ab7ae8748 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -15,12 +15,12 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:07:53 +DATE: 2023-06-16_23:34:34 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0543s - [COUNTERS] Fortran Overhead ( 0 ) : 0.0429s - [COUNTERS] Fortran MEs ( 1 ) : 0.0114s for 8192 events => throughput is 7.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.0320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.0199s + [COUNTERS] Fortran MEs ( 1 ) : 0.0121s for 8192 events => throughput is 6.78E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1753s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1635s - [COUNTERS] Fortran MEs ( 1 ) : 0.0117s for 8192 events => throughput is 6.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1743s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1627s + [COUNTERS] Fortran MEs ( 1 ) : 0.0116s for 8192 events => throughput is 7.07E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4615s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3334s - [COUNTERS] Fortran MEs ( 1 ) : 0.1280s for 90112 events => throughput is 7.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4602s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3321s + [COUNTERS] Fortran MEs ( 1 ) : 0.1281s for 90112 events => throughput is 7.03E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1772s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1714s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0058s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4049s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0643s for 90112 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3381s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0639s for 90112 events => throughput is 1.41E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.345240e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336870e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.361173e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.374247e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1810s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1782s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1985s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1956s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.92E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3375s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0317s for 90112 events => throughput is 2.84E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0312s for 90112 events => throughput is 2.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.696407e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.728154e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.906298e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.907973e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1683s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1667s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1716s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.87E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3514s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3341s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 90112 events => throughput is 5.22E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3546s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 90112 events => throughput is 5.21E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.240211e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.040980e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.733985e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.910901e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1692s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1676s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1706s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1692s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0014s for 8192 events => throughput is 5.70E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3522s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3359s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0163s for 90112 events => throughput is 5.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3592s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0162s for 90112 events => throughput is 5.56E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.597422e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.443652e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.695526e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.102617e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1717s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1701s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1741s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.43E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3584s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3398s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0186s for 90112 events => throughput is 4.84E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3577s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0184s for 90112 events => throughput is 4.89E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.626417e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.745967e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.233368e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.387840e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6509s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6504s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6235s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6230s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.65E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7973s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7925s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.88E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7970s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.023949e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.108671e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.435051e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.375732e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.509189e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.977014e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.021389e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036419e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.505707e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.981844e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.070677e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.078134e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.474746e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.944271e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.989292e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.976379e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index b0f885d159..6e3da73554 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -2,28 +2,28 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:08:10 +DATE: 2023-06-16_23:34:51 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0313s + [COUNTERS] PROGRAM TOTAL : 0.0316s [COUNTERS] Fortran Overhead ( 0 ) : 0.0200s - [COUNTERS] Fortran MEs ( 1 ) : 0.0113s for 8192 events => throughput is 7.22E+05 events/s + [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1741s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1624s - [COUNTERS] Fortran MEs ( 1 ) : 0.0117s for 8192 events => throughput is 6.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1738s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1623s + [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.12E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4605s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3350s - [COUNTERS] Fortran MEs ( 1 ) : 0.1256s for 90112 events => throughput is 7.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4892s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3640s + [COUNTERS] Fortran MEs ( 1 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166140620297] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1828s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1771s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 8192 events => throughput is 1.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1765s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1709s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.45E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501907784661565E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4079s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0630s for 90112 events => throughput is 1.43E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4045s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3421s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.384435e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.366086e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.401232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411236e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165549479658] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1775s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1758s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1666s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.27E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905692857932E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3816s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3628s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0188s for 90112 events => throughput is 4.79E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3347s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 90112 events => throughput is 5.23E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.201858e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.178390e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.497702e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.517565e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165569099927] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1811s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1802s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1682s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905658047333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3407s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3314s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.61E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.088326e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.065328e+07 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152504e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184488e+07 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165569099927] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1749s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.05E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.1673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1665s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.03E+07 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905658047333E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3512s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3423s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0088s for 90112 events => throughput is 1.02E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.3433s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3345s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0087s for 90112 events => throughput is 1.03E+07 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.074978e+07 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241538e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.114652e+07 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166431914253] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1731s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0010s for 8192 events => throughput is 7.98E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1701s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 8.67E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501909358591468E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3579s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3476s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0103s for 90112 events => throughput is 8.76E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3454s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3353s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 90112 events => throughput is 8.87E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.645409e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.877305e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.019707e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149185e+07 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166796068879] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6252s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.75E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6303s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6298s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.72E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910316213061E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7953s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7906s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.95E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7969s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.144847e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.510070e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.291546e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.150527e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.427391e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.548906e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.483788e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.466611e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.690073e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.695373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.736153e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.743025e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.082066e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.786521e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.402127e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.372126e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 3102d673e5..08471d1c00 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -2,26 +2,26 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:08:26 +DATE: 2023-06-16_23:35:08 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.0304s - [COUNTERS] Fortran Overhead ( 0 ) : 0.0192s - [COUNTERS] Fortran MEs ( 1 ) : 0.0112s for 8192 events => throughput is 7.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.0313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.0197s + [COUNTERS] Fortran MEs ( 1 ) : 0.0115s for 8192 events => throughput is 7.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1634s - [COUNTERS] Fortran MEs ( 1 ) : 0.0113s for 8192 events => throughput is 7.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1766s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1651s + [COUNTERS] Fortran MEs ( 1 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4673s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3395s - [COUNTERS] Fortran MEs ( 1 ) : 0.1278s for 90112 events => throughput is 7.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4702s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3402s + [COUNTERS] Fortran MEs ( 1 ) : 0.1300s for 90112 events => throughput is 6.93E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1800s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1741s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1796s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.33E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4109s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3447s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0662s for 90112 events => throughput is 1.36E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4089s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3434s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0655s for 90112 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.314415e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316240e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.325033e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.353376e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1739s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1710s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.84E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1732s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1703s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.85E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3678s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3362s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0316s for 90112 events => throughput is 2.86E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3519s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.66E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.778171e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780240e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.881949e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942864e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1664s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1650s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1715s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.22E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3523s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3356s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 90112 events => throughput is 5.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3543s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0174s for 90112 events => throughput is 5.17E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.309933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.231827e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.865678e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.881656e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1678s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1708s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1692s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0015s for 8192 events => throughput is 5.38E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3518s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 90112 events => throughput is 5.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3556s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3393s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0164s for 90112 events => throughput is 5.51E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.692846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.595835e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.371329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.329466e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1681s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1665s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1706s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1690s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0016s for 8192 events => throughput is 5.02E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3597s + [COUNTERS] PROGRAM TOTAL : 0.3594s [COUNTERS] Fortran Overhead ( 0 ) : 0.3416s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0181s for 90112 events => throughput is 4.99E+06 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0178s for 90112 events => throughput is 5.06E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.697754e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.773234e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.185493e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.345326e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -511,15 +511,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169066587255] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6224s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6219s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.65E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6290s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6285s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169066587257) differ by less than 2E-4 (8.761968928183705e-11) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169066587255) differ by less than 2E-4 (8.761968928183705e-11) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7929s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7881s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7937s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7888s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.83E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.897587e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.341832e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.375732e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.367767e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.509740e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.004324e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.014707e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.022759e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.494163e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.001795e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.062796e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064728e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.492160e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.982012e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.984517e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000317e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 6835fd761b..9da4005516 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,25 +1,25 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:08:43 +DATE: 2023-06-16_23:35:24 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.2502s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1961s - [COUNTERS] Fortran MEs ( 1 ) : 0.0541s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2030s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1493s + [COUNTERS] Fortran MEs ( 1 ) : 0.0538s for 8192 events => throughput is 1.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3147s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2601s - [COUNTERS] Fortran MEs ( 1 ) : 0.0545s for 8192 events => throughput is 1.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3135s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2598s + [COUNTERS] Fortran MEs ( 1 ) : 0.0536s for 8192 events => throughput is 1.53E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7733s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1842s - [COUNTERS] Fortran MEs ( 1 ) : 0.5891s for 90112 events => throughput is 1.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7753s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1856s + [COUNTERS] Fortran MEs ( 1 ) : 0.5897s for 90112 events => throughput is 1.53E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2995s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3502s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3091s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6869s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2329s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4540s for 90112 events => throughput is 1.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6970s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2435s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4535s for 90112 events => throughput is 1.99E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.005444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.992097e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.979728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951938e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3103s + [COUNTERS] PROGRAM TOTAL : 0.3095s [COUNTERS] Fortran Overhead ( 0 ) : 0.2872s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 8192 events => throughput is 3.54E+05 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0223s for 8192 events => throughput is 3.68E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4708s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2163s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2545s for 90112 events => throughput is 3.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4568s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2139s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2429s for 90112 events => throughput is 3.71E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.618962e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.737410e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.549989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630060e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,8 +285,8 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2882s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2759s + [COUNTERS] PROGRAM TOTAL : 0.2885s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2762s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.65E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3465s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2084s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1381s for 90112 events => throughput is 6.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3388s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2039s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1350s for 90112 events => throughput is 6.68E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.181171e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.378851e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.333764e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.500232e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2739s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2876s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0111s for 8192 events => throughput is 7.39E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3369s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2095s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1274s for 90112 events => throughput is 7.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3666s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1244s for 90112 events => throughput is 7.24E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.679320e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.995934e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.731519e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.995046e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2991s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2809s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0182s for 8192 events => throughput is 4.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0186s for 8192 events => throughput is 4.39E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4162s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2137s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2025s for 90112 events => throughput is 4.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4130s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1999s for 90112 events => throughput is 4.51E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.391750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.352063e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.219610e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.264252e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7158s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7314s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.38E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6552s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6488s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6728s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6663s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.39E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.992775e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883699e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.639116e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.569644e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.831733e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.587002e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.076823e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.074126e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.823039e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570667e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.150288e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.153037e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.844704e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.562530e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.023470e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.066634e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 5579fe8617..01adf8925b 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:09:09 +DATE: 2023-06-16_23:35:50 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.2009s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1471s - [COUNTERS] Fortran MEs ( 1 ) : 0.0538s for 8192 events => throughput is 1.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2007s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1470s + [COUNTERS] Fortran MEs ( 1 ) : 0.0537s for 8192 events => throughput is 1.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3628s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3076s - [COUNTERS] Fortran MEs ( 1 ) : 0.0552s for 8192 events => throughput is 1.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3500s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] Fortran MEs ( 1 ) : 0.0535s for 8192 events => throughput is 1.53E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7742s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1846s - [COUNTERS] Fortran MEs ( 1 ) : 0.5896s for 90112 events => throughput is 1.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7703s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1795s + [COUNTERS] Fortran MEs ( 1 ) : 0.5908s for 90112 events => throughput is 1.53E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690706211693573] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3651s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3244s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0407s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3412s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3013s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 8192 events => throughput is 2.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782418787778] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6679s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4369s for 90112 events => throughput is 2.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2335s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4387s for 90112 events => throughput is 2.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.966734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020933e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.016967e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.051565e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690707641465352] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3112s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2975s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2808s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.90E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786452345514] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3982s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2176s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1807s for 90112 events => throughput is 4.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3782s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1711s for 90112 events => throughput is 5.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.085609e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.159675e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.924733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.216268e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690698819656767] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2678s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2713s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.23E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782736292961] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.2819s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2022s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0797s for 90112 events => throughput is 1.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.2805s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2059s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0747s for 90112 events => throughput is 1.21E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.077134e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.150882e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066914e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149098e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690698819656767] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2727s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2758s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2693s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.27E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782736292961] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.2657s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1919s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0737s for 90112 events => throughput is 1.22E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.2778s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0688s for 90112 events => throughput is 1.31E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.160532e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.195362e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.196653e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.247695e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703490151122] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2807s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2705s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 8.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2869s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2776s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.73E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223787021597481] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3324s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2235s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1088s for 90112 events => throughput is 8.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3127s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2100s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1027s for 90112 events => throughput is 8.77E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.393193e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.078289e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.640628e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.098096e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7213s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7208s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.53E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7221s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7216s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6790s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6735s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.63E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6678s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6624s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.65E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.030695e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.070407e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.904056e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.904132e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.544562e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.021955e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.743689e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.726468e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.597493e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.093398e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.853850e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.841474e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.148658e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.753742e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.464534e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.383547e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index ac950dc182..bcc7cef90a 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:09:34 +DATE: 2023-06-16_23:36:15 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.2019s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1476s - [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1481s + [COUNTERS] Fortran MEs ( 1 ) : 0.0539s for 8192 events => throughput is 1.52E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3111s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2566s - [COUNTERS] Fortran MEs ( 1 ) : 0.0545s for 8192 events => throughput is 1.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2632s + [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7649s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1764s - [COUNTERS] Fortran MEs ( 1 ) : 0.5885s for 90112 events => throughput is 1.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7727s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1825s + [COUNTERS] Fortran MEs ( 1 ) : 0.5903s for 90112 events => throughput is 1.53E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3455s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3037s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3466s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3052s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0414s for 8192 events => throughput is 1.98E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6999s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2377s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4621s for 90112 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6933s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2360s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4573s for 90112 events => throughput is 1.97E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.954461e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.952836e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.963840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.944744e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3087s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2862s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 8192 events => throughput is 3.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3071s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.77E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4797s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2199s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2598s for 90112 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4646s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2169s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2477s for 90112 events => throughput is 3.64E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.513716e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.726713e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.551160e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678276e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2855s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2733s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2880s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2760s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.83E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4261s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1369s for 90112 events => throughput is 6.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3686s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2290s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1396s for 90112 events => throughput is 6.46E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.158900e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.474630e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.187440e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.428506e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2837s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2725s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2738s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0110s for 8192 events => throughput is 7.42E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3314s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2068s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1246s for 90112 events => throughput is 7.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3245s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2028s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1217s for 90112 events => throughput is 7.40E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.995898e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.064546e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.193852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.181463e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2981s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0185s for 8192 events => throughput is 4.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3000s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2818s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0182s for 8192 events => throughput is 4.50E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4111s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2140s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1970s for 90112 events => throughput is 4.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4141s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2163s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1978s for 90112 events => throughput is 4.55E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.328138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.215527e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.536361e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.203869e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7374s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7369s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7253s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7248s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6676s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6584s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6519s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.39E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.995936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.858157e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.581788e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.585975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.823598e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.575302e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.052568e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.048555e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.805134e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.576352e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.131874e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.132080e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.821685e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.576975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.023937e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969589e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 6db4b12575..52294c86ed 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -2,38 +2,38 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - -make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2023-07-20_22:09:59 +DATE: 2023-06-16_23:36:41 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.7072s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3574s - [COUNTERS] Fortran MEs ( 1 ) : 0.3498s for 8192 events => throughput is 2.34E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5432s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1981s + [COUNTERS] Fortran MEs ( 1 ) : 0.3452s for 8192 events => throughput is 2.37E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.6014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2492s - [COUNTERS] Fortran MEs ( 1 ) : 0.3523s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2409s + [COUNTERS] Fortran MEs ( 1 ) : 0.3455s for 8192 events => throughput is 2.37E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1681s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3720s - [COUNTERS] Fortran MEs ( 1 ) : 3.7961s for 90112 events => throughput is 2.37E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.1533s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3710s + [COUNTERS] Fortran MEs ( 1 ) : 3.7823s for 90112 events => throughput is 2.38E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.8675s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5519s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3156s for 8192 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8655s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5504s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3151s for 8192 events => throughput is 2.60E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967963E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1987s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7046s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4941s for 90112 events => throughput is 2.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.2253s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7057s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5196s for 90112 events => throughput is 2.56E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.672832e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.634187e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.667309e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.657523e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.5882s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4201s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1681s for 8192 events => throughput is 4.87E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5827s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.5007s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5875s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.9133s for 90112 events => throughput is 4.71E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4871s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5641s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9229s for 90112 events => throughput is 4.69E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.996427e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.964351e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.063044e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.061720e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171206E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3261s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0787s for 8192 events => throughput is 1.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4029s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3246s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0783s for 8192 events => throughput is 1.05E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3504s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4793s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8710s for 90112 events => throughput is 1.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4879s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8819s for 90112 events => throughput is 1.02E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055678e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.047204e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.048985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053567e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171206E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3852s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3140s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0712s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3886s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3177s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0709s for 8192 events => throughput is 1.15E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2636s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4748s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7888s for 90112 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2440s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4672s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7768s for 90112 events => throughput is 1.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.151866e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177490e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.192009e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.191999e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4444s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3446s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0997s for 8192 events => throughput is 8.21E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4369s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0966s for 8192 events => throughput is 8.48E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967907E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6071s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4983s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1089s for 90112 events => throughput is 8.13E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5408s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4804s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0604s for 90112 events => throughput is 8.50E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.135325e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.369247e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.227114e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.461216e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171220E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.7046s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7029s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7104s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7087s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.77E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967977E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8739s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8548s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0191s for 90112 events => throughput is 4.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8754s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8562s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 90112 events => throughput is 4.68E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.181078e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.108652e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.839865e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.056830e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.407489e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.357483e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.237597e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238541e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.400140e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.098545e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.253060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.247043e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.421827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.065031e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.789057e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797581e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index e5e5a94357..ac98f0132b 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -2,11 +2,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2023-07-20_22:10:41 +DATE: 2023-06-16_23:37:23 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.5067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1651s - [COUNTERS] Fortran MEs ( 1 ) : 0.3415s for 8192 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5130s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1657s + [COUNTERS] Fortran MEs ( 1 ) : 0.3473s for 8192 events => throughput is 2.36E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.5851s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2400s - [COUNTERS] Fortran MEs ( 1 ) : 0.3451s for 8192 events => throughput is 2.37E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6055s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2536s + [COUNTERS] Fortran MEs ( 1 ) : 0.3519s for 8192 events => throughput is 2.33E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1578s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3685s - [COUNTERS] Fortran MEs ( 1 ) : 3.7893s for 90112 events => throughput is 2.38E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.1398s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3678s + [COUNTERS] Fortran MEs ( 1 ) : 3.7720s for 90112 events => throughput is 2.39E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195711188152623E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.8560s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3111s for 8192 events => throughput is 2.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8594s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5459s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3135s for 8192 events => throughput is 2.61E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310861450156910E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.1196s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7009s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4187s for 90112 events => throughput is 2.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.1084s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6927s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.4157s for 90112 events => throughput is 2.64E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.706286e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.731642e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.700736e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.740748e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195687405490658E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4377s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3445s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0932s for 8192 events => throughput is 8.79E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4313s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3404s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0909s for 8192 events => throughput is 9.02E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310854844234101E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.5473s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4834s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0639s for 90112 events => throughput is 8.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5514s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4872s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0642s for 90112 events => throughput is 8.47E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.429280e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.703250e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.048509e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.982646e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195715140566227E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3251s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2845s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3260s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2868s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0392s for 8192 events => throughput is 2.09E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310851236127482E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8985s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4487s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4499s for 90112 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8684s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4327s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4357s for 90112 events => throughput is 2.07E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.989595e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.029319e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.062720e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.080772e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195715140566227E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3148s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2783s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0365s for 8192 events => throughput is 2.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3161s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310851236127482E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8306s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4000s for 90112 events => throughput is 2.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8280s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4335s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3946s for 90112 events => throughput is 2.28E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.270425e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.320491e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.241221e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.339938e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195727520443878E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3449s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2957s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0493s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3397s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2926s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0470s for 8192 events => throughput is 1.74E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310861771879989E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9827s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5378s for 90112 events => throughput is 1.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9470s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4325s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5145s for 90112 events => throughput is 1.75E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.634127e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.739989e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.637008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.725640e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195710869056637E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.7056s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7048s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7105s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7096s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0009s for 8192 events => throughput is 9.53E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310865716831132E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8606s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8510s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 90112 events => throughput is 9.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8990s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8894s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0097s for 90112 events => throughput is 9.33E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.297311e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264826e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.852891e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.844127e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.581946e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.461458e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.391631e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.280416e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.592998e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.442329e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.444681e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.431551e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.459085e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.330311e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.593672e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602408e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 7d215cd993..de3b503603 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -7,33 +7,33 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2023-07-20_22:11:19 +DATE: 2023-06-16_23:38:00 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 0.5131s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1661s - [COUNTERS] Fortran MEs ( 1 ) : 0.3470s for 8192 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5113s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1671s + [COUNTERS] Fortran MEs ( 1 ) : 0.3442s for 8192 events => throughput is 2.38E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719386171234E-002] fbridge_mode=0 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.5859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2416s - [COUNTERS] Fortran MEs ( 1 ) : 0.3443s for 8192 events => throughput is 2.38E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6040s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2440s + [COUNTERS] Fortran MEs ( 1 ) : 0.3600s for 8192 events => throughput is 2.28E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872844967921E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2070s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3904s - [COUNTERS] Fortran MEs ( 1 ) : 3.8166s for 90112 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.1624s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3666s + [COUNTERS] Fortran MEs ( 1 ) : 3.7958s for 90112 events => throughput is 2.37E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195720226233587E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.8784s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5586s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3198s for 8192 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5588s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3217s for 8192 events => throughput is 2.55E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310873602323142E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2676s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7131s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5545s for 90112 events => throughput is 2.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.2705s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7115s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5590s for 90112 events => throughput is 2.53E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.599022e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.610260e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.610033e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.619292e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195720267415450E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.5748s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4062s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1686s for 8192 events => throughput is 4.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5849s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4204s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1645s for 8192 events => throughput is 4.98E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310873604102080E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4996s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5780s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.9216s for 90112 events => throughput is 4.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4135s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5611s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8524s for 90112 events => throughput is 4.86E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.763374e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.764708e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.760811e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.772730e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195720049465126E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4034s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3255s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0779s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4005s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3219s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0786s for 8192 events => throughput is 1.04E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310873476230255E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3291s + [COUNTERS] PROGRAM TOTAL : 2.3277s [COUNTERS] Fortran Overhead ( 0 ) : 1.4694s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8597s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 90112 events => throughput is 1.05E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.056094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054740e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.056229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.057335e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195720049465126E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.3850s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3150s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3809s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3126s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0683s for 8192 events => throughput is 1.20E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310873476230255E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.2228s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4562s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7666s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2562s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4769s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7793s for 90112 events => throughput is 1.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.188784e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193684e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.206033e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221094e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195720220276491E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.4552s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3501s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1051s for 8192 events => throughput is 7.79E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4495s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3476s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1019s for 8192 events => throughput is 8.04E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310873571012007E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6310s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4975s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1335s for 90112 events => throughput is 7.95E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6068s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4960s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1108s for 90112 events => throughput is 8.11E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.962496e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.078718e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.967931e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.044468e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7195719566775987E-002] fbridge_mode=1 [UNWEIGHT] Wrote 40 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 0.7080s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7062s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.79E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7145s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7128s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.63E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872835946929E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8776s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8583s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 90112 events => throughput is 4.68E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.8693s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8501s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0192s for 90112 events => throughput is 4.69E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.171619e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.091918e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.617135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.629216e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.428025e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.133810e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.236068e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.233339e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.402481e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.048619e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.247568e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238445e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.435104e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.073517e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.765273e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.781744e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index a2dcd45cf7..56b03784ad 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -6,22 +6,22 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 + +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - -make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:12:01 +DATE: 2023-06-16_23:38:42 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.4298s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2555s - [COUNTERS] Fortran MEs ( 1 ) : 4.1743s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3757s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2161s + [COUNTERS] Fortran MEs ( 1 ) : 4.1595s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.4495s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3001s - [COUNTERS] Fortran MEs ( 1 ) : 4.1494s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4480s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2999s + [COUNTERS] Fortran MEs ( 1 ) : 4.1481s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 47.4939s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8948s - [COUNTERS] Fortran MEs ( 1 ) : 45.5991s for 90112 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.5995s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8973s + [COUNTERS] Fortran MEs ( 1 ) : 45.7021s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 8.6209s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4153s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2055s for 8192 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.5650s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3703s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1947s for 8192 events => throughput is 1.95E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 8bdf388153..fbd2ca3bdb 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:13:09 +DATE: 2023-06-16_23:39:50 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.3889s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2167s - [COUNTERS] Fortran MEs ( 1 ) : 4.1723s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4077s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2143s + [COUNTERS] Fortran MEs ( 1 ) : 4.1934s for 8192 events => throughput is 1.95E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.4632s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3033s - [COUNTERS] Fortran MEs ( 1 ) : 4.1599s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3039s + [COUNTERS] Fortran MEs ( 1 ) : 4.1354s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 47.8293s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9068s - [COUNTERS] Fortran MEs ( 1 ) : 45.9225s for 90112 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.6726s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8981s + [COUNTERS] Fortran MEs ( 1 ) : 45.7745s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277396515517582E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 8.3655s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2729s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.0926s for 8192 events => throughput is 2.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.4058s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2739s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1319s for 8192 events => throughput is 1.98E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 3790f1ae10..c64d8630ed 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -5,20 +5,21 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. @@ -27,13 +28,12 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2023-07-20_22:14:17 +DATE: 2023-06-16_23:40:57 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 4.4609s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2133s - [COUNTERS] Fortran MEs ( 1 ) : 4.2476s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2154s + [COUNTERS] Fortran MEs ( 1 ) : 4.2016s for 8192 events => throughput is 1.95E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 4.4921s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3024s - [COUNTERS] Fortran MEs ( 1 ) : 4.1897s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3042s + [COUNTERS] Fortran MEs ( 1 ) : 4.3752s for 8192 events => throughput is 1.87E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748610604E-004] fbridge_mode=0 [UNWEIGHT] Wrote 204 events (found 1633 events) - [COUNTERS] PROGRAM TOTAL : 47.5479s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9002s - [COUNTERS] Fortran MEs ( 1 ) : 45.6476s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.7432s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9035s + [COUNTERS] Fortran MEs ( 1 ) : 45.8397s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 49 events (found 738 events) - [COUNTERS] PROGRAM TOTAL : 8.7180s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4096s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3084s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.6986s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4272s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2714s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 3a56c0dc26..39ca5692cf 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -2,28 +2,28 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_22:15:25 +DATE: 2023-06-16_23:42:06 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 96.3038s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5615s - [COUNTERS] Fortran MEs ( 1 ) : 95.7423s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.2551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4152s + [COUNTERS] Fortran MEs ( 1 ) : 95.8399s for 8192 events => throughput is 8.55E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 96.9521s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4744s - [COUNTERS] Fortran MEs ( 1 ) : 96.4777s for 8192 events => throughput is 8.49E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.4720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4731s + [COUNTERS] Fortran MEs ( 1 ) : 95.9989s for 8192 events => throughput is 8.53E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1062.6840s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0557s - [COUNTERS] Fortran MEs ( 1 ) : 1058.6283s for 90112 events => throughput is 8.51E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1059.0573s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0327s + [COUNTERS] Fortran MEs ( 1 ) : 1055.0245s for 90112 events => throughput is 8.54E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 214.0450s - [COUNTERS] Fortran Overhead ( 0 ) : 97.9116s - [COUNTERS] CudaCpp MEs ( 2 ) : 116.1333s for 8192 events => throughput is 7.05E+01 events/s + [COUNTERS] PROGRAM TOTAL : 215.3874s + [COUNTERS] Fortran Overhead ( 0 ) : 96.8318s + [COUNTERS] CudaCpp MEs ( 2 ) : 118.5557s for 8192 events => throughput is 6.91E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1382.3778s - [COUNTERS] Fortran Overhead ( 0 ) : 102.3927s - [COUNTERS] CudaCpp MEs ( 2 ) : 1279.9851s for 90112 events => throughput is 7.04E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1357.2262s + [COUNTERS] Fortran Overhead ( 0 ) : 100.3230s + [COUNTERS] CudaCpp MEs ( 2 ) : 1256.9032s for 90112 events => throughput is 7.17E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.454988e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.518822e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.432444e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.469802e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 108.7414s - [COUNTERS] Fortran Overhead ( 0 ) : 50.4501s - [COUNTERS] CudaCpp MEs ( 2 ) : 58.2913s for 8192 events => throughput is 1.41E+02 events/s + [COUNTERS] PROGRAM TOTAL : 109.0441s + [COUNTERS] Fortran Overhead ( 0 ) : 50.2199s + [COUNTERS] CudaCpp MEs ( 2 ) : 58.8242s for 8192 events => throughput is 1.39E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 696.6832s - [COUNTERS] Fortran Overhead ( 0 ) : 54.0281s - [COUNTERS] CudaCpp MEs ( 2 ) : 642.6552s for 90112 events => throughput is 1.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 705.3776s + [COUNTERS] Fortran Overhead ( 0 ) : 53.9344s + [COUNTERS] CudaCpp MEs ( 2 ) : 651.4432s for 90112 events => throughput is 1.38E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.650580e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640675e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.649235e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645178e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 51.6414s - [COUNTERS] Fortran Overhead ( 0 ) : 23.7804s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.8610s for 8192 events => throughput is 2.94E+02 events/s + [COUNTERS] PROGRAM TOTAL : 51.2174s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3441s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.8733s for 8192 events => throughput is 2.94E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 336.0637s - [COUNTERS] Fortran Overhead ( 0 ) : 27.3859s - [COUNTERS] CudaCpp MEs ( 2 ) : 308.6777s for 90112 events => throughput is 2.92E+02 events/s + [COUNTERS] PROGRAM TOTAL : 332.6731s + [COUNTERS] Fortran Overhead ( 0 ) : 26.9170s + [COUNTERS] CudaCpp MEs ( 2 ) : 305.7561s for 90112 events => throughput is 2.95E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.526467e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.549919e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.522582e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.597201e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 45.7116s - [COUNTERS] Fortran Overhead ( 0 ) : 20.7382s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.9733s for 8192 events => throughput is 3.28E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.3753s + [COUNTERS] Fortran Overhead ( 0 ) : 20.9676s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4077s for 8192 events => throughput is 3.36E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 300.9525s - [COUNTERS] Fortran Overhead ( 0 ) : 24.6064s - [COUNTERS] CudaCpp MEs ( 2 ) : 276.3461s for 90112 events => throughput is 3.26E+02 events/s + [COUNTERS] PROGRAM TOTAL : 294.5042s + [COUNTERS] Fortran Overhead ( 0 ) : 24.4920s + [COUNTERS] CudaCpp MEs ( 2 ) : 270.0121s for 90112 events => throughput is 3.34E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.996392e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.021260e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.018122e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.013283e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 46.0464s - [COUNTERS] Fortran Overhead ( 0 ) : 22.3585s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.6879s for 8192 events => throughput is 3.46E+02 events/s + [COUNTERS] PROGRAM TOTAL : 46.3863s + [COUNTERS] Fortran Overhead ( 0 ) : 22.5070s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8792s for 8192 events => throughput is 3.43E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 290.7705s - [COUNTERS] Fortran Overhead ( 0 ) : 26.0974s - [COUNTERS] CudaCpp MEs ( 2 ) : 264.6732s for 90112 events => throughput is 3.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 289.0610s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2218s + [COUNTERS] CudaCpp MEs ( 2 ) : 262.8391s for 90112 events => throughput is 3.43E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.727020e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.740541e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.724799e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745479e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 4.2713s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1897s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0816s for 8192 events => throughput is 7.57E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2951s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2087s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0864s for 8192 events => throughput is 7.54E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 18.7030s - [COUNTERS] Fortran Overhead ( 0 ) : 6.7629s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9401s for 90112 events => throughput is 7.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.7118s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7622s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9496s for 90112 events => throughput is 7.54E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.552375e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.487117e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.238010e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.224789e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.259744e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.215613e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.543703e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.512340e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.221689e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.227523e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.406762e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.402144e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.295029e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.254965e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.251320e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.253935e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e2b78c550f..c7c6154514 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -2,30 +2,30 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + +make USEBUILDDIR=1 AVX=512y make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-20_23:42:26 +DATE: 2023-06-17_01:08:35 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 96.7838s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4145s - [COUNTERS] Fortran MEs ( 1 ) : 96.3693s for 8192 events => throughput is 8.50E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.9433s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4120s + [COUNTERS] Fortran MEs ( 1 ) : 95.5312s for 8192 events => throughput is 8.58E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 96.7621s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4794s - [COUNTERS] Fortran MEs ( 1 ) : 96.2827s for 8192 events => throughput is 8.51E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.0818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4708s + [COUNTERS] Fortran MEs ( 1 ) : 95.6110s for 8192 events => throughput is 8.57E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1063.3107s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0709s - [COUNTERS] Fortran MEs ( 1 ) : 1059.2397s for 90112 events => throughput is 8.51E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1056.4460s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0785s + [COUNTERS] Fortran MEs ( 1 ) : 1052.3676s for 90112 events => throughput is 8.56E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694768395608941E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 203.3226s - [COUNTERS] Fortran Overhead ( 0 ) : 93.0120s - [COUNTERS] CudaCpp MEs ( 2 ) : 110.3106s for 8192 events => throughput is 7.43E+01 events/s + [COUNTERS] PROGRAM TOTAL : 201.5191s + [COUNTERS] Fortran Overhead ( 0 ) : 92.3001s + [COUNTERS] CudaCpp MEs ( 2 ) : 109.2190s for 8192 events => throughput is 7.50E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361436148187123E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1318.2776s - [COUNTERS] Fortran Overhead ( 0 ) : 96.5166s - [COUNTERS] CudaCpp MEs ( 2 ) : 1221.7610s for 90112 events => throughput is 7.38E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1305.0132s + [COUNTERS] Fortran Overhead ( 0 ) : 96.3805s + [COUNTERS] CudaCpp MEs ( 2 ) : 1208.6327s for 90112 events => throughput is 7.46E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.865804e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.869030e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.875022e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.908799e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694766634537254E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 48.9583s - [COUNTERS] Fortran Overhead ( 0 ) : 23.0233s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.9350s for 8192 events => throughput is 3.16E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.8581s + [COUNTERS] Fortran Overhead ( 0 ) : 22.8819s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.9762s for 8192 events => throughput is 3.15E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435622518579E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 313.8979s - [COUNTERS] Fortran Overhead ( 0 ) : 26.7375s - [COUNTERS] CudaCpp MEs ( 2 ) : 287.1604s for 90112 events => throughput is 3.14E+02 events/s + [COUNTERS] PROGRAM TOTAL : 311.8489s + [COUNTERS] Fortran Overhead ( 0 ) : 26.4793s + [COUNTERS] CudaCpp MEs ( 2 ) : 285.3696s for 90112 events => throughput is 3.16E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.642412e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.654834e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.633616e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.636170e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765364749936E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 26.0413s - [COUNTERS] Fortran Overhead ( 0 ) : 12.1196s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.9216s for 8192 events => throughput is 5.88E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.9178s + [COUNTERS] Fortran Overhead ( 0 ) : 12.1541s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.7637s for 8192 events => throughput is 5.95E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435955979457E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 169.6116s - [COUNTERS] Fortran Overhead ( 0 ) : 15.7325s - [COUNTERS] CudaCpp MEs ( 2 ) : 153.8790s for 90112 events => throughput is 5.86E+02 events/s + [COUNTERS] PROGRAM TOTAL : 167.6069s + [COUNTERS] Fortran Overhead ( 0 ) : 15.7359s + [COUNTERS] CudaCpp MEs ( 2 ) : 151.8710s for 90112 events => throughput is 5.93E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.975053e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.068292e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.057112e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.077191e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765364749936E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 23.2323s - [COUNTERS] Fortran Overhead ( 0 ) : 10.7579s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.4744s for 8192 events => throughput is 6.57E+02 events/s + [COUNTERS] PROGRAM TOTAL : 23.2564s + [COUNTERS] Fortran Overhead ( 0 ) : 10.7772s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.4793s for 8192 events => throughput is 6.56E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435955979457E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 151.1313s - [COUNTERS] Fortran Overhead ( 0 ) : 14.2784s - [COUNTERS] CudaCpp MEs ( 2 ) : 136.8530s for 90112 events => throughput is 6.58E+02 events/s + [COUNTERS] PROGRAM TOTAL : 152.0499s + [COUNTERS] Fortran Overhead ( 0 ) : 14.4286s + [COUNTERS] CudaCpp MEs ( 2 ) : 137.6212s for 90112 events => throughput is 6.55E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.028040e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.965515e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.994508e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.954223e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694767893082863E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 23.6719s - [COUNTERS] Fortran Overhead ( 0 ) : 11.5944s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.0775s for 8192 events => throughput is 6.78E+02 events/s + [COUNTERS] PROGRAM TOTAL : 23.5866s + [COUNTERS] Fortran Overhead ( 0 ) : 11.6220s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9645s for 8192 events => throughput is 6.85E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361441834174529E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 145.3129s - [COUNTERS] Fortran Overhead ( 0 ) : 15.1829s - [COUNTERS] CudaCpp MEs ( 2 ) : 130.1300s for 90112 events => throughput is 6.92E+02 events/s + [COUNTERS] PROGRAM TOTAL : 145.9255s + [COUNTERS] Fortran Overhead ( 0 ) : 15.0340s + [COUNTERS] CudaCpp MEs ( 2 ) : 130.8914s for 90112 events => throughput is 6.88E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.456772e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.533944e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.395401e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.436083e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 2.5194s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4982s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5187s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0274s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4913s for 8192 events => throughput is 1.67E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 11.0644s - [COUNTERS] Fortran Overhead ( 0 ) : 5.5950s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4693s for 90112 events => throughput is 1.65E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.0853s + [COUNTERS] Fortran Overhead ( 0 ) : 5.6057s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4795s for 90112 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.640849e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.640525e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.625699e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.649395e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.310677e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.325104e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.446787e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.391765e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.291626e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.322841e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.329119e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.376800e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.322445e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338034e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.458808e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.503445e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index af1a7875aa..40600c6dee 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -2,8 +2,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 @@ -17,8 +17,8 @@ make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-07-21_00:49:22 +DATE: 2023-06-17_02:15:04 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -58,9 +58,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [] fbridge_mode=0 - [COUNTERS] PROGRAM TOTAL : 96.3904s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4118s - [COUNTERS] Fortran MEs ( 1 ) : 95.9786s for 8192 events => throughput is 8.54E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1609s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4132s + [COUNTERS] Fortran MEs ( 1 ) : 95.7477s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -83,9 +83,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435808E-006] fbridge_mode=0 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 96.5751s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4733s - [COUNTERS] Fortran MEs ( 1 ) : 96.1018s for 8192 events => throughput is 8.52E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.6436s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4751s + [COUNTERS] Fortran MEs ( 1 ) : 96.1685s for 8192 events => throughput is 8.52E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -108,9 +108,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1060.8092s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0518s - [COUNTERS] Fortran MEs ( 1 ) : 1056.7574s for 90112 events => throughput is 8.53E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1059.8674s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0491s + [COUNTERS] Fortran MEs ( 1 ) : 1055.8184s for 90112 events => throughput is 8.53E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -133,9 +133,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 216.7386s - [COUNTERS] Fortran Overhead ( 0 ) : 98.6629s - [COUNTERS] CudaCpp MEs ( 2 ) : 118.0758s for 8192 events => throughput is 6.94E+01 events/s + [COUNTERS] PROGRAM TOTAL : 211.4841s + [COUNTERS] Fortran Overhead ( 0 ) : 97.1384s + [COUNTERS] CudaCpp MEs ( 2 ) : 114.3457s for 8192 events => throughput is 7.16E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -166,9 +166,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 1387.3131s - [COUNTERS] Fortran Overhead ( 0 ) : 102.2094s - [COUNTERS] CudaCpp MEs ( 2 ) : 1285.1038s for 90112 events => throughput is 7.01E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1372.8556s + [COUNTERS] Fortran Overhead ( 0 ) : 101.8141s + [COUNTERS] CudaCpp MEs ( 2 ) : 1271.0415s for 90112 events => throughput is 7.09E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -181,12 +181,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.374378e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.162213e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.380253e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.331097e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -209,9 +209,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 111.4582s - [COUNTERS] Fortran Overhead ( 0 ) : 50.9549s - [COUNTERS] CudaCpp MEs ( 2 ) : 60.5033s for 8192 events => throughput is 1.35E+02 events/s + [COUNTERS] PROGRAM TOTAL : 110.6811s + [COUNTERS] Fortran Overhead ( 0 ) : 51.3766s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.3045s for 8192 events => throughput is 1.38E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -242,9 +242,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 721.2943s - [COUNTERS] Fortran Overhead ( 0 ) : 54.5037s - [COUNTERS] CudaCpp MEs ( 2 ) : 666.7906s for 90112 events => throughput is 1.35E+02 events/s + [COUNTERS] PROGRAM TOTAL : 688.7864s + [COUNTERS] Fortran Overhead ( 0 ) : 54.7130s + [COUNTERS] CudaCpp MEs ( 2 ) : 634.0734s for 90112 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -257,12 +257,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.618624e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.616861e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.620246e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.616649e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -285,9 +285,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 49.7149s - [COUNTERS] Fortran Overhead ( 0 ) : 22.6636s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.0513s for 8192 events => throughput is 3.03E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.6391s + [COUNTERS] Fortran Overhead ( 0 ) : 22.3277s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3114s for 8192 events => throughput is 3.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -318,9 +318,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 322.8981s - [COUNTERS] Fortran Overhead ( 0 ) : 26.4448s - [COUNTERS] CudaCpp MEs ( 2 ) : 296.4533s for 90112 events => throughput is 3.04E+02 events/s + [COUNTERS] PROGRAM TOTAL : 327.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 25.8163s + [COUNTERS] CudaCpp MEs ( 2 ) : 301.5990s for 90112 events => throughput is 2.99E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -333,12 +333,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.676587e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.748239e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.671497e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733522e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -361,9 +361,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 43.7631s - [COUNTERS] Fortran Overhead ( 0 ) : 19.8887s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.8744s for 8192 events => throughput is 3.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 43.4217s + [COUNTERS] Fortran Overhead ( 0 ) : 19.9092s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5125s for 8192 events => throughput is 3.48E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -394,9 +394,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 284.0151s - [COUNTERS] Fortran Overhead ( 0 ) : 23.3057s - [COUNTERS] CudaCpp MEs ( 2 ) : 260.7094s for 90112 events => throughput is 3.46E+02 events/s + [COUNTERS] PROGRAM TOTAL : 282.1566s + [COUNTERS] Fortran Overhead ( 0 ) : 23.4368s + [COUNTERS] CudaCpp MEs ( 2 ) : 258.7198s for 90112 events => throughput is 3.48E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -409,12 +409,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.247220e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.246539e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.249990e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.237720e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -437,9 +437,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 45.5793s - [COUNTERS] Fortran Overhead ( 0 ) : 22.0503s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.5291s for 8192 events => throughput is 3.48E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.4934s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9129s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5805s for 8192 events => throughput is 3.47E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -470,9 +470,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 282.8288s - [COUNTERS] Fortran Overhead ( 0 ) : 25.3919s - [COUNTERS] CudaCpp MEs ( 2 ) : 257.4370s for 90112 events => throughput is 3.50E+02 events/s + [COUNTERS] PROGRAM TOTAL : 285.7173s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4131s + [COUNTERS] CudaCpp MEs ( 2 ) : 260.3043s for 90112 events => throughput is 3.46E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -485,12 +485,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.817081e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.849415e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.830491e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.819185e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -513,9 +513,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 14 events (found 457 events) - [COUNTERS] PROGRAM TOTAL : 3.6444s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7798s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8646s for 8192 events => throughput is 9.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6199s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7558s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8641s for 8192 events => throughput is 9.48E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -546,9 +546,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 1181 events) - [COUNTERS] PROGRAM TOTAL : 15.8203s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3107s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.5095s for 90112 events => throughput is 9.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.8257s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3170s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5087s for 90112 events => throughput is 9.48E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -561,41 +561,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.416564e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.405516e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.086300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081261e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.112982e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109602e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.163767e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160351e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109442e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107621e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111792e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111526e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108274e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109298e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.663594e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.655752e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 9db097681f..e655c4eb6e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:02:35 +DATE: 2023-07-20_17:34:37 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.529465e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.968737e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.317167e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.496716e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.822116e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.807140e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.987540 sec - 2,857,079,236 cycles # 2.897 GHz - 3,965,914,902 instructions # 1.39 insn per cycle - 1.555641805 seconds time elapsed +TOTAL : 0.730366 sec + 2,835,972,917 cycles # 2.916 GHz + 4,070,940,598 instructions # 1.44 insn per cycle + 1.031386667 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.204828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.491015e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.491015e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.179679e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.470366e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.470366e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.625405 sec - 17,427,533,067 cycles # 3.096 GHz - 41,065,953,663 instructions # 2.36 insn per cycle - 5.672995071 seconds time elapsed +TOTAL : 5.774897 sec + 17,777,594,761 cycles # 3.074 GHz + 41,158,036,844 instructions # 2.32 insn per cycle + 5.787940703 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.041628e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.127901e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.127901e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.988364e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.081546e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.081546e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.487088 sec - 10,698,042,835 cycles # 3.066 GHz - 25,327,736,951 instructions # 2.37 insn per cycle - 3.672079156 seconds time elapsed +TOTAL : 3.606987 sec + 11,101,757,126 cycles # 3.069 GHz + 25,419,894,046 instructions # 2.29 insn per cycle + 3.620080819 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.976301e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.896673e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.896673e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.910924e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.953750e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.953750e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.512323 sec - 7,490,301,994 cycles # 2.976 GHz - 14,325,359,476 instructions # 1.91 insn per cycle - 2.688081818 seconds time elapsed +TOTAL : 2.602997 sec + 7,774,557,603 cycles # 2.976 GHz + 14,415,609,848 instructions # 1.85 insn per cycle + 2.621766479 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.075263e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.352012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.352012e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.020624e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.316933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.316933e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.438849 sec - 7,298,837,690 cycles # 2.988 GHz - 14,031,918,582 instructions # 1.92 insn per cycle - 2.656178218 seconds time elapsed +TOTAL : 2.519460 sec + 7,530,977,350 cycles # 2.975 GHz + 14,122,388,495 instructions # 1.88 insn per cycle + 2.539541776 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.826813e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.472040e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.472040e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.782890e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.407831e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.407831e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.634307 sec - 6,540,523,246 cycles # 2.480 GHz - 10,814,026,157 instructions # 1.65 insn per cycle - 2.710932662 seconds time elapsed +TOTAL : 2.705992 sec + 6,844,449,594 cycles # 2.519 GHz + 10,905,997,050 instructions # 1.59 insn per cycle + 2.729302546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 13d9e715fe..413524a714 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:51:57 +DATE: 2023-06-16_23:18:49 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.937526e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.494601e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.494601e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.138466e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761625e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761625e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.523007 sec - 8,214,545,539 cycles # 2.952 GHz - 13,760,667,099 instructions # 1.68 insn per cycle - 2.839776552 seconds time elapsed +TOTAL : 2.435025 sec + 8,057,144,700 cycles # 2.996 GHz + 13,634,238,127 instructions # 1.69 insn per cycle + 2.748108130 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.157655e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419412e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.146185e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.405096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.405096e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.045954 sec - 18,725,115,815 cycles # 3.095 GHz - 41,378,617,025 instructions # 2.21 insn per cycle - 6.053626333 seconds time elapsed +TOTAL : 6.106864 sec + 18,682,647,016 cycles # 3.057 GHz + 41,378,608,262 instructions # 2.21 insn per cycle + 6.114253847 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.905264e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.823745e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.823745e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.923353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.849109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849109e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.904171 sec - 11,953,469,054 cycles # 3.058 GHz - 26,176,916,495 instructions # 2.19 insn per cycle - 3.918091679 seconds time elapsed +TOTAL : 3.885345 sec + 11,937,212,794 cycles # 3.069 GHz + 26,176,863,335 instructions # 2.19 insn per cycle + 3.901575111 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.717815e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.930947e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.930947e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.738552e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.036567e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.036567e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.922782 sec - 8,772,089,445 cycles # 2.995 GHz - 15,690,142,374 instructions # 1.79 insn per cycle - 2.936115188 seconds time elapsed +TOTAL : 2.911902 sec + 8,807,623,027 cycles # 3.018 GHz + 15,689,801,673 instructions # 1.78 insn per cycle + 2.919670444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.808530e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.241997e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.241997e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.823116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.286837e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.286837e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.848706 sec - 8,497,417,491 cycles # 2.976 GHz - 15,397,129,647 instructions # 1.81 insn per cycle - 2.856960353 seconds time elapsed +TOTAL : 2.843015 sec + 8,509,599,499 cycles # 2.988 GHz + 15,397,182,071 instructions # 1.81 insn per cycle + 2.850349634 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.581473e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.594178e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.594178e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.546825e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.539933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.539933e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.066530 sec - 7,907,709,484 cycles # 2.577 GHz - 11,966,197,304 instructions # 1.51 insn per cycle - 3.079832339 seconds time elapsed +TOTAL : 3.102839 sec + 7,964,585,861 cycles # 2.565 GHz + 11,966,072,331 instructions # 1.50 insn per cycle + 3.117079564 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 52eee6658e..f307a0f66d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_22:04:21 +DATE: 2023-06-16_23:31:11 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.553595e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.165015e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.664709e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.936041e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.298657e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700721e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.351777 sec - 4,751,582,983 cycles # 2.985 GHz - 6,956,966,068 instructions # 1.46 insn per cycle - 1.648673871 seconds time elapsed +TOTAL : 1.341897 sec + 4,726,189,877 cycles # 2.972 GHz + 6,969,421,973 instructions # 1.47 insn per cycle + 1.646460673 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.204000e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.490109e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.490109e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.210740e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.496812e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.496812e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.985883 sec - 18,522,424,622 cycles # 3.093 GHz - 41,194,396,088 instructions # 2.22 insn per cycle - 5.992469097 seconds time elapsed +TOTAL : 5.955030 sec + 18,512,766,741 cycles # 3.107 GHz + 41,194,403,666 instructions # 2.23 insn per cycle + 5.961394817 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.050166e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.149024e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.149024e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.047034e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.131905e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.131905e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.825489 sec - 11,790,939,975 cycles # 3.080 GHz - 25,354,938,210 instructions # 2.15 insn per cycle - 3.831996805 seconds time elapsed +TOTAL : 3.837203 sec + 11,777,572,402 cycles # 3.067 GHz + 25,355,656,397 instructions # 2.15 insn per cycle + 3.848753687 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.968102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.916412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.916412e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.018909e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.963202e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.963202e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.871349 sec - 8,634,298,238 cycles # 3.003 GHz - 14,249,713,160 instructions # 1.65 insn per cycle - 2.888950821 seconds time elapsed +TOTAL : 2.834606 sec + 8,590,828,376 cycles # 3.027 GHz + 14,249,676,242 instructions # 1.66 insn per cycle + 2.846836909 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.092837e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.305381e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.305381e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.112124e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.390077e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.390077e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.793351 sec - 8,400,301,090 cycles # 3.003 GHz - 13,755,514,422 instructions # 1.64 insn per cycle - 2.808663540 seconds time elapsed +TOTAL : 2.779947 sec + 8,413,366,977 cycles # 3.021 GHz + 13,755,530,918 instructions # 1.63 insn per cycle + 2.792283352 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.894267e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.666552e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.666552e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.941539e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.752941e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.752941e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.934585 sec - 7,771,597,304 cycles # 2.645 GHz - 10,538,658,309 instructions # 1.36 insn per cycle - 2.940717040 seconds time elapsed +TOTAL : 2.901859 sec + 7,689,553,685 cycles # 2.645 GHz + 10,538,700,131 instructions # 1.37 insn per cycle + 2.908219333 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index d808f0d451..c1df1cffcd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_22:01:23 +DATE: 2023-06-16_23:28:12 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.577511e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.207519e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.723156e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.961494e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.330926e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.727946e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.000385 sec - 3,654,644,889 cycles # 2.930 GHz - 6,903,002,204 instructions # 1.89 insn per cycle - 1.304605996 seconds time elapsed +TOTAL : 0.991651 sec + 3,630,538,921 cycles # 2.949 GHz + 6,739,620,599 instructions # 1.86 insn per cycle + 1.288650450 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.202275e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.486265e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.486265e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.207504e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.492388e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.492388e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.633591 sec - 17,385,661,264 cycles # 3.086 GHz - 41,064,253,107 instructions # 2.36 insn per cycle - 5.640000913 seconds time elapsed +TOTAL : 5.609999 sec + 17,377,959,437 cycles # 3.096 GHz + 41,067,120,156 instructions # 2.36 insn per cycle + 5.616601968 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.068819e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.173145e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.173145e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.061159e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.155823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.155823e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.439204 sec - 10,666,865,410 cycles # 3.097 GHz - 25,328,206,480 instructions # 2.37 insn per cycle - 3.451870304 seconds time elapsed +TOTAL : 3.452926 sec + 10,661,248,447 cycles # 3.085 GHz + 25,328,629,608 instructions # 2.38 insn per cycle + 3.465344366 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.988656e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.958011e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.958011e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.992039e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.943088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.943088e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.497873 sec - 7,477,997,966 cycles # 2.988 GHz - 14,323,986,771 instructions # 1.92 insn per cycle - 2.509286823 seconds time elapsed +TOTAL : 2.500278 sec + 7,490,696,881 cycles # 2.992 GHz + 14,324,115,086 instructions # 1.91 insn per cycle + 2.512588472 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.069355e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.311915e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.311915e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.099847e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.437862e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.437862e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.443303 sec - 7,292,693,775 cycles # 2.979 GHz - 14,031,211,656 instructions # 1.92 insn per cycle - 2.454703317 seconds time elapsed +TOTAL : 2.420313 sec + 7,280,482,659 cycles # 3.003 GHz + 14,031,142,533 instructions # 1.93 insn per cycle + 2.426602182 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.939779e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.745634e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.745634e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.913808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.700903e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.700903e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.536500 sec - 6,571,691,043 cycles # 2.587 GHz - 10,814,207,345 instructions # 1.65 insn per cycle - 2.548985898 seconds time elapsed +TOTAL : 2.559114 sec + 6,555,384,580 cycles # 2.558 GHz + 10,814,650,468 instructions # 1.65 insn per cycle + 2.565400113 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index a55de79a58..6e1b117ddd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:58:20 +DATE: 2023-06-16_23:25:12 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.160896e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.137033e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.582591e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.189825e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.289841e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.647085e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.075371 sec - 7,019,752,170 cycles # 3.034 GHz - 11,903,191,172 instructions # 1.70 insn per cycle - 2.372256807 seconds time elapsed +TOTAL : 2.074835 sec + 6,914,545,522 cycles # 2.988 GHz + 11,919,156,975 instructions # 1.72 insn per cycle + 2.372897157 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 @@ -70,14 +70,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.210758e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.497596e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.497596e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.202902e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.488452e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.488452e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.595736 sec - 17,398,813,024 cycles # 3.107 GHz - 41,067,393,878 instructions # 2.36 insn per cycle - 5.602703968 seconds time elapsed +TOTAL : 5.632592 sec + 17,383,031,697 cycles # 3.085 GHz + 41,067,827,351 instructions # 2.36 insn per cycle + 5.638706459 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -96,14 +96,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.048723e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.138363e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.138363e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.068552e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.168923e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.168923e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.471828 sec - 10,749,780,985 cycles # 3.092 GHz - 25,328,225,446 instructions # 2.36 insn per cycle - 3.484346812 seconds time elapsed +TOTAL : 3.443621 sec + 10,653,431,550 cycles # 3.090 GHz + 25,328,207,820 instructions # 2.38 insn per cycle + 3.459263800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1283) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.840636e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.668678e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.668678e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.993581e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.951966e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.951966e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.626012 sec - 7,544,466,587 cycles # 2.868 GHz - 14,324,547,691 instructions # 1.90 insn per cycle - 2.637949451 seconds time elapsed +TOTAL : 2.497810 sec + 7,472,614,101 cycles # 2.988 GHz + 14,326,081,552 instructions # 1.92 insn per cycle + 2.513276024 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1063) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.975985e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.067183e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.067183e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.123387e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389805e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.389805e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.519378 sec - 7,258,882,628 cycles # 2.875 GHz - 14,031,403,334 instructions # 1.93 insn per cycle - 2.538176734 seconds time elapsed +TOTAL : 2.406381 sec + 7,220,335,602 cycles # 2.995 GHz + 14,031,141,989 instructions # 1.94 insn per cycle + 2.412823702 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -174,14 +174,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.801872e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.393339e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.393339e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.914294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.704937e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704937e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.665672 sec - 6,511,369,093 cycles # 2.442 GHz - 10,815,325,314 instructions # 1.66 insn per cycle - 2.672073669 seconds time elapsed +TOTAL : 2.563138 sec + 6,557,308,012 cycles # 2.558 GHz + 10,815,146,087 instructions # 1.65 insn per cycle + 2.575443013 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 289) (512y: 0) (512z: 683) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 01a6d0f42b..bd06bd6ba5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:03:07 +DATE: 2023-06-16_22:48:51 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.573202e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.404217e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.095191e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.643941e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.489939e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087323e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.729393 sec - 2,859,827,151 cycles # 2.934 GHz - 3,986,993,672 instructions # 1.39 insn per cycle - 1.207903347 seconds time elapsed +TOTAL : 0.690482 sec + 2,734,951,372 cycles # 2.921 GHz + 3,869,487,763 instructions # 1.41 insn per cycle + 1.001478108 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.198359e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.482315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.482315e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.201121e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.486695e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.486695e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.653578 sec - 17,443,809,257 cycles # 3.084 GHz - 41,016,121,895 instructions # 2.35 insn per cycle - 5.781648091 seconds time elapsed +TOTAL : 5.640974 sec + 17,377,915,107 cycles # 3.080 GHz + 41,019,735,572 instructions # 2.36 insn per cycle + 5.647089750 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 362) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.048682e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.145841e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.145841e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.055579e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.158330e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.158330e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.471810 sec - 10,719,166,528 cycles # 3.083 GHz - 25,290,771,945 instructions # 2.36 insn per cycle - 3.539414387 seconds time elapsed +TOTAL : 3.461985 sec + 10,655,685,663 cycles # 3.075 GHz + 25,289,974,301 instructions # 2.37 insn per cycle + 3.474152122 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1270) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.927686e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.869730e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.869730e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.953920e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.887932e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.887932e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.548863 sec - 7,544,094,940 cycles # 2.955 GHz - 14,295,698,107 instructions # 1.89 insn per cycle - 2.812727538 seconds time elapsed +TOTAL : 2.529478 sec + 7,488,504,684 cycles # 2.955 GHz + 14,297,973,959 instructions # 1.91 insn per cycle + 2.535527552 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.078899e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.346487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.346487e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.063577e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.335894e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.335894e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.436719 sec - 7,317,931,634 cycles # 2.997 GHz - 14,018,643,738 instructions # 1.92 insn per cycle - 2.502189878 seconds time elapsed +TOTAL : 2.447618 sec + 7,308,425,300 cycles # 2.980 GHz + 14,017,785,626 instructions # 1.92 insn per cycle + 2.453901778 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1004) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.995777e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.985928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.985928e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.005001e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.053828e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.053828e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.500608 sec - 6,440,866,463 cycles # 2.571 GHz - 10,694,671,248 instructions # 1.66 insn per cycle - 3.057581442 seconds time elapsed +TOTAL : 2.492808 sec + 6,433,917,466 cycles # 2.577 GHz + 10,696,732,836 instructions # 1.66 insn per cycle + 2.504690032 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 267) (512y: 0) (512z: 663) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 03c4722a03..4005df7354 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:41:38 +DATE: 2023-06-16_23:08:58 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.978619e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.210810e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.781232e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.934597e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.349196e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.751072e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.718304 sec - 2,802,532,857 cycles # 2.908 GHz - 3,960,989,523 instructions # 1.41 insn per cycle - 1.021805080 seconds time elapsed +TOTAL : 0.703650 sec + 2,783,628,482 cycles # 2.937 GHz + 3,817,522,930 instructions # 1.37 insn per cycle + 1.006043909 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.627066e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.553584e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.553584e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.627042e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.555043e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.555043e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.801602 sec - 8,543,367,725 cycles # 3.044 GHz - 17,313,586,195 instructions # 2.03 insn per cycle - 2.808318415 seconds time elapsed +TOTAL : 2.809092 sec + 8,529,730,783 cycles # 3.037 GHz + 17,314,493,447 instructions # 2.03 insn per cycle + 2.815992048 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 206) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.331833e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.846013e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.846013e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.394827e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.052489e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.052489e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.300110 sec - 6,902,107,070 cycles # 2.994 GHz - 13,420,734,031 instructions # 1.94 insn per cycle - 2.306985565 seconds time elapsed +TOTAL : 2.257313 sec + 6,891,555,509 cycles # 3.048 GHz + 13,420,618,567 instructions # 1.95 insn per cycle + 2.263346481 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 809) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.988287e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202020e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202020e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.953793e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.186747e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186747e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.971744 sec - 5,938,354,621 cycles # 3.004 GHz - 10,444,183,718 instructions # 1.76 insn per cycle - 1.983463748 seconds time elapsed +TOTAL : 1.994156 sec + 5,944,600,029 cycles # 2.973 GHz + 10,446,540,517 instructions # 1.76 insn per cycle + 2.006757717 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 460) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.137178e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313339e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.313339e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.165108e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323250e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323250e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.915605 sec - 5,759,486,412 cycles # 2.999 GHz - 10,321,747,357 instructions # 1.79 insn per cycle - 1.922275147 seconds time elapsed +TOTAL : 1.906328 sec + 5,747,679,195 cycles # 3.008 GHz + 10,324,227,604 instructions # 1.80 insn per cycle + 1.912748843 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 435) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.751378e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.005396e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.005396e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.709510e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.834439e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.834439e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.077420 sec - 5,636,831,657 cycles # 2.706 GHz - 9,347,856,156 instructions # 1.66 insn per cycle - 2.089951914 seconds time elapsed +TOTAL : 2.101712 sec + 5,645,784,770 cycles # 2.680 GHz + 9,348,170,561 instructions # 1.66 insn per cycle + 2.114432099 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 221) (512y: 0) (512z: 276) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 95c354a108..bc4e48c353 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:42:01 +DATE: 2023-06-16_23:09:22 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.069337e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.881679e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.087012e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.022137e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.034901e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081788e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.713881 sec - 2,825,126,091 cycles # 2.911 GHz - 3,908,837,089 instructions # 1.38 insn per cycle - 1.028319097 seconds time elapsed +TOTAL : 0.701380 sec + 2,726,534,804 cycles # 2.886 GHz + 3,786,125,853 instructions # 1.39 insn per cycle + 1.004213223 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.483902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.897855e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.897855e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.510910e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.966131e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.966131e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.209642 sec - 6,748,447,250 cycles # 3.048 GHz - 13,573,833,841 instructions # 2.01 insn per cycle - 2.216566349 seconds time elapsed +TOTAL : 2.194238 sec + 6,733,882,413 cycles # 3.062 GHz + 13,573,888,806 instructions # 2.02 insn per cycle + 2.200295951 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 176) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.935122e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183564e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183564e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.963639e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.196983e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.196983e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.005097 sec - 6,124,980,667 cycles # 3.047 GHz - 11,421,138,107 instructions # 1.86 insn per cycle - 2.011602030 seconds time elapsed +TOTAL : 1.987100 sec + 6,108,038,272 cycles # 3.066 GHz + 11,421,394,512 instructions # 1.87 insn per cycle + 1.998910641 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 609) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.209785e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.412402e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.412402e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.184988e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.401255e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.401255e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.891149 sec - 5,704,125,639 cycles # 3.007 GHz - 9,756,180,671 instructions # 1.71 insn per cycle - 1.904755924 seconds time elapsed +TOTAL : 1.904581 sec + 5,674,452,757 cycles # 2.972 GHz + 9,756,418,512 instructions # 1.72 insn per cycle + 1.916141098 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 365) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.445802e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.686006e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.686006e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.421355e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.712475e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.712475e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.812329 sec - 5,474,550,735 cycles # 3.015 GHz - 9,745,407,403 instructions # 1.78 insn per cycle - 1.825467138 seconds time elapsed +TOTAL : 1.818113 sec + 5,526,907,542 cycles # 3.034 GHz + 9,745,448,132 instructions # 1.76 insn per cycle + 1.824604908 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 356) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.878707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095160e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.095160e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.924922e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.119550e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.119550e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.025891 sec - 5,532,892,920 cycles # 2.724 GHz - 9,060,665,277 instructions # 1.64 insn per cycle - 2.032766999 seconds time elapsed +TOTAL : 2.000527 sec + 5,531,314,603 cycles # 2.758 GHz + 9,060,806,562 instructions # 1.64 insn per cycle + 2.012991860 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 189) (512y: 0) (512z: 227) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 52f8be9eb2..998d7298b5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:03:38 +DATE: 2023-06-16_22:49:20 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.097627e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.185337e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.805015e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.622816e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.472400e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.786994e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.613943 sec - 2,462,194,602 cycles # 2.897 GHz - 3,447,627,675 instructions # 1.40 insn per cycle - 1.038863806 seconds time elapsed +TOTAL : 0.583321 sec + 2,430,104,692 cycles # 2.932 GHz + 3,363,264,752 instructions # 1.38 insn per cycle + 0.886216701 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.262782e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.521574e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.521574e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.264259e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.523188e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.523188e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.344273 sec - 16,511,296,687 cycles # 3.088 GHz - 40,100,950,089 instructions # 2.43 insn per cycle - 5.587823656 seconds time elapsed +TOTAL : 5.337690 sec + 16,489,915,021 cycles # 3.088 GHz + 40,104,655,673 instructions # 2.43 insn per cycle + 5.343808523 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.223607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.141677e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.141677e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.262321e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.200988e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.200988e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.313200 sec - 7,157,550,921 cycles # 3.087 GHz - 16,747,542,246 instructions # 2.34 insn per cycle - 2.460375414 seconds time elapsed +TOTAL : 2.286472 sec + 7,103,303,250 cycles # 3.101 GHz + 16,746,623,366 instructions # 2.36 insn per cycle + 2.292241843 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.546283e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.215779e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.215779e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.581033e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.234851e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.234851e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.741843 sec - 5,252,137,284 cycles # 3.006 GHz - 10,647,367,648 instructions # 2.03 insn per cycle - 1.929182358 seconds time elapsed +TOTAL : 1.728168 sec + 5,223,225,548 cycles # 3.015 GHz + 10,646,468,952 instructions # 2.04 insn per cycle + 1.739591338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.688942e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.317240e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.317240e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.727128e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.320782e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.320782e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.701494 sec - 5,143,242,422 cycles # 3.012 GHz - 10,500,737,864 instructions # 2.04 insn per cycle - 1.820063429 seconds time elapsed +TOTAL : 1.685516 sec + 5,127,500,119 cycles # 3.035 GHz + 10,500,102,407 instructions # 2.05 insn per cycle + 1.698440130 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.447053e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138591e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.138591e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.545044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.174805e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.174805e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.779028 sec - 4,760,792,224 cycles # 2.667 GHz - 8,949,201,063 instructions # 1.88 insn per cycle - 1.858730004 seconds time elapsed +TOTAL : 1.747234 sec + 4,737,364,431 cycles # 2.712 GHz + 8,948,917,615 instructions # 1.89 insn per cycle + 1.753348775 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 2cfdbc9492..eea602cb6a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:52:31 +DATE: 2023-06-16_23:19:22 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! flagging abnormal ME for ievt=247522 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=7, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.107844e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.622066e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.622066e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.151373e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.051120e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.051120e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371709e-02 +- 3.270386e-06 ) GeV^0 -TOTAL : 1.714869 sec - 5,864,266,699 cycles # 3.006 GHz - 10,145,301,613 instructions # 1.73 insn per cycle - 2.009444104 seconds time elapsed +TOTAL : 1.716284 sec + 5,788,689,598 cycles # 2.965 GHz + 10,066,532,076 instructions # 1.74 insn per cycle + 2.010969778 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -90,14 +90,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.234208e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.483162e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.483162e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.231634e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.480435e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.480435e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.566301 sec - 17,193,101,838 cycles # 3.086 GHz - 40,268,743,710 instructions # 2.34 insn per cycle - 5.573566469 seconds time elapsed +TOTAL : 5.578840 sec + 17,225,374,898 cycles # 3.086 GHz + 40,276,553,142 instructions # 2.34 insn per cycle + 5.585821624 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -123,14 +123,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.043485e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.481971e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.481971e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.937007e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.345782e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.345782e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.548754 sec - 7,888,164,892 cycles # 3.089 GHz - 18,078,170,390 instructions # 2.29 insn per cycle - 2.560940285 seconds time elapsed +TOTAL : 2.639278 sec + 7,955,090,673 cycles # 3.008 GHz + 18,082,112,251 instructions # 2.27 insn per cycle + 2.646477973 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -154,14 +154,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.201504e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.002675e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002675e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.189753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.014027e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.014027e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.979155 sec - 5,973,636,348 cycles # 3.010 GHz - 11,767,329,571 instructions # 1.97 insn per cycle - 1.994751835 seconds time elapsed +TOTAL : 1.979778 sec + 5,995,131,291 cycles # 3.020 GHz + 11,763,127,657 instructions # 1.96 insn per cycle + 1.992584680 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.335585e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.066980e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.066980e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.319958e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.080076e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.080076e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.929553 sec - 5,873,544,533 cycles # 3.034 GHz - 11,620,758,858 instructions # 1.98 insn per cycle - 1.937130451 seconds time elapsed +TOTAL : 1.934129 sec + 5,894,246,896 cycles # 3.038 GHz + 11,620,616,390 instructions # 1.97 insn per cycle + 1.941396703 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.098852e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.352591e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.352591e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.141992e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.375055e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.375055e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 2.017840 sec - 5,557,685,542 cycles # 2.746 GHz - 10,155,804,635 instructions # 1.83 insn per cycle - 2.031581647 seconds time elapsed +TOTAL : 1.999503 sec + 5,512,078,856 cycles # 2.749 GHz + 10,155,647,564 instructions # 1.84 insn per cycle + 2.016503539 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 2bddc59b30..800a4b8c86 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_22:04:53 +DATE: 2023-06-16_23:31:42 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.313108e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.282917e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.712923e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.394086e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.330155e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.707146e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.190378 sec - 4,238,174,422 cycles # 2.983 GHz - 6,329,606,995 instructions # 1.49 insn per cycle - 1.477276067 seconds time elapsed +TOTAL : 1.184633 sec + 4,183,254,985 cycles # 2.960 GHz + 6,293,699,635 instructions # 1.50 insn per cycle + 1.472409896 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.254969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.512364e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.512364e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.262596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.522021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.522021e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270265e-06 ) GeV^0 -TOTAL : 5.701175 sec - 17,525,941,279 cycles # 3.074 GHz - 40,292,526,400 instructions # 2.30 insn per cycle - 5.707670702 seconds time elapsed +TOTAL : 5.659502 sec + 17,477,401,058 cycles # 3.088 GHz + 40,266,459,883 instructions # 2.30 insn per cycle + 5.665157466 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.228211e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.125267e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.125267e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.236448e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.142806e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.142806e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270265e-06 ) GeV^0 -TOTAL : 2.623722 sec - 8,099,777,030 cycles # 3.083 GHz - 16,832,747,929 instructions # 2.08 insn per cycle - 2.635391834 seconds time elapsed +TOTAL : 2.618721 sec + 8,103,544,055 cycles # 3.089 GHz + 16,832,855,817 instructions # 2.08 insn per cycle + 2.624569274 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.485901e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210303e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.210303e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.539437e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.230738e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.230738e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270110e-06 ) GeV^0 -TOTAL : 2.079174 sec - 6,267,675,940 cycles # 3.008 GHz - 10,563,218,685 instructions # 1.69 insn per cycle - 2.091876958 seconds time elapsed +TOTAL : 2.059577 sec + 6,252,474,158 cycles # 3.029 GHz + 10,562,743,863 instructions # 1.69 insn per cycle + 2.071952790 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.672664e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.300192e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.300192e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.617687e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.289170e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.289170e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270110e-06 ) GeV^0 -TOTAL : 2.030540 sec - 6,140,087,237 cycles # 3.016 GHz - 10,214,892,588 instructions # 1.66 insn per cycle - 2.036660743 seconds time elapsed +TOTAL : 2.052194 sec + 6,160,094,141 cycles # 2.996 GHz + 10,211,164,443 instructions # 1.66 insn per cycle + 2.069184232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.479102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.148649e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.148649e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.509722e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158916e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.158916e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371884e-02 +- 3.270111e-06 ) GeV^0 -TOTAL : 2.085915 sec - 5,778,389,555 cycles # 2.766 GHz - 8,663,226,822 instructions # 1.50 insn per cycle - 2.091863470 seconds time elapsed +TOTAL : 2.079386 sec + 5,756,219,942 cycles # 2.763 GHz + 8,663,173,729 instructions # 1.51 insn per cycle + 2.085435176 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index b36b1df9e0..440dcefee1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_22:01:52 +DATE: 2023-06-16_23:28:42 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.328684e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.310164e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.793402e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.402219e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.355346e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.784726e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.868346 sec - 3,242,552,600 cycles # 2.959 GHz - 6,196,904,395 instructions # 1.91 insn per cycle - 1.155213782 seconds time elapsed +TOTAL : 0.867299 sec + 3,217,543,414 cycles # 2.926 GHz + 6,137,693,119 instructions # 1.91 insn per cycle + 1.157361320 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.256009e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.514087e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.514087e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.257010e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.513861e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.513861e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.371440 sec - 16,510,007,988 cycles # 3.071 GHz - 40,104,456,149 instructions # 2.43 insn per cycle - 5.377502781 seconds time elapsed +TOTAL : 5.368758 sec + 16,502,939,001 cycles # 3.072 GHz + 40,105,024,525 instructions # 2.43 insn per cycle + 5.374937445 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.231735e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.113308e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.113308e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.242270e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.146410e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.146410e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.309242 sec - 7,090,993,576 cycles # 3.067 GHz - 16,744,502,151 instructions # 2.36 insn per cycle - 2.314862812 seconds time elapsed +TOTAL : 2.303063 sec + 7,094,904,869 cycles # 3.075 GHz + 16,746,800,277 instructions # 2.36 insn per cycle + 2.314822596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.591509e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.229477e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.229477e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.586979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.231913e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.231913e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.724883 sec - 5,198,407,236 cycles # 3.005 GHz - 10,646,573,554 instructions # 2.05 insn per cycle - 1.737430076 seconds time elapsed +TOTAL : 1.807105 sec + 5,428,964,238 cycles # 2.996 GHz + 10,692,005,584 instructions # 1.97 insn per cycle + 1.822790793 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.711754e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.317700e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.317700e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.726711e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.329947e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.329947e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.693232 sec - 5,106,489,382 cycles # 3.007 GHz - 10,500,313,731 instructions # 2.06 insn per cycle - 1.699710936 seconds time elapsed +TOTAL : 1.687490 sec + 5,097,137,843 cycles # 3.011 GHz + 10,500,104,402 instructions # 2.06 insn per cycle + 1.704174033 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.579719e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180019e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180019e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.531072e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167602e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.167602e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.728497 sec - 4,725,938,024 cycles # 2.727 GHz - 8,948,218,285 instructions # 1.89 insn per cycle - 1.734586729 seconds time elapsed +TOTAL : 1.748105 sec + 4,711,439,440 cycles # 2.688 GHz + 8,948,315,074 instructions # 1.90 insn per cycle + 1.760555512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index fb60477383..6541a30c4f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:58:51 +DATE: 2023-06-16_23:25:42 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=7, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.387065e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.245343e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.560379e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.106073e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.321114e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.631808e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371709e-02 +- 3.270386e-06 ) GeV^0 -TOTAL : 1.583187 sec - 5,150,918,663 cycles # 2.835 GHz - 9,058,229,610 instructions # 1.76 insn per cycle - 1.874132749 seconds time elapsed +TOTAL : 1.491290 sec + 5,169,582,957 cycles # 2.984 GHz + 9,050,754,175 instructions # 1.75 insn per cycle + 1.789532913 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 @@ -70,14 +70,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.221079e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.469897e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.469897e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.266280e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.526041e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.526041e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.525262 sec - 16,531,332,189 cycles # 2.991 GHz - 40,105,741,831 instructions # 2.43 insn per cycle - 5.532126555 seconds time elapsed +TOTAL : 5.330040 sec + 16,494,480,958 cycles # 3.092 GHz + 40,104,668,257 instructions # 2.43 insn per cycle + 5.336402582 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 368) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe @@ -96,14 +96,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.075987e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.810037e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.810037e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.242421e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.129823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.129823e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.425690 sec - 7,110,249,965 cycles # 2.926 GHz - 16,747,192,802 instructions # 2.36 insn per cycle - 2.438192485 seconds time elapsed +TOTAL : 2.301776 sec + 7,083,962,972 cycles # 3.073 GHz + 16,746,671,271 instructions # 2.36 insn per cycle + 2.307495453 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe @@ -122,14 +122,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.479858e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200697e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.200697e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.610071e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237721e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.237721e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.770125 sec - 5,211,859,314 cycles # 2.936 GHz - 10,646,762,565 instructions # 2.04 insn per cycle - 1.783056514 seconds time elapsed +TOTAL : 1.717055 sec + 5,214,546,292 cycles # 3.028 GHz + 10,646,475,199 instructions # 2.04 insn per cycle + 1.729853327 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1140) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.477835e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.246515e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.246515e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.669054e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.286554e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.286554e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.780615 sec - 5,119,952,797 cycles # 2.867 GHz - 10,500,503,418 instructions # 2.05 insn per cycle - 1.793552507 seconds time elapsed +TOTAL : 1.706696 sec + 5,116,265,885 cycles # 2.990 GHz + 10,500,169,607 instructions # 2.05 insn per cycle + 1.718833211 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe @@ -174,14 +174,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.477036e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.144484e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.144484e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.528217e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.171325e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.171325e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.762289 sec - 4,755,674,056 cycles # 2.692 GHz - 8,948,422,313 instructions # 1.88 insn per cycle - 1.776599430 seconds time elapsed +TOTAL : 1.745381 sec + 4,755,213,854 cycles # 2.718 GHz + 8,948,305,241 instructions # 1.88 insn per cycle + 1.757628494 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 408) (512y: 0) (512z: 710) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index cc9c8cf98a..278cecd3e4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:04:06 +DATE: 2023-06-16_22:49:45 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.100984e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.206064e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.905406e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.626450e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.505770e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.909932e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.610136 sec - 2,463,272,829 cycles # 2.910 GHz - 3,434,339,299 instructions # 1.39 insn per cycle - 1.284031869 seconds time elapsed +TOTAL : 0.584101 sec + 2,373,467,870 cycles # 2.893 GHz + 3,334,548,453 instructions # 1.40 insn per cycle + 0.877449920 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 80 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.260762e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.519765e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.519765e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.259578e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.517908e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.517908e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 5.354216 sec - 16,544,884,221 cycles # 3.087 GHz - 40,053,108,251 instructions # 2.42 insn per cycle - 5.408703557 seconds time elapsed +TOTAL : 5.356459 sec + 16,505,080,474 cycles # 3.080 GHz + 40,054,237,440 instructions # 2.43 insn per cycle + 5.362931581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.215023e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.093991e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.093991e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.233010e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.140192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.140192e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.321486 sec - 7,123,570,979 cycles # 3.062 GHz - 16,671,354,222 instructions # 2.34 insn per cycle - 2.623084982 seconds time elapsed +TOTAL : 2.305928 sec + 7,093,977,561 cycles # 3.071 GHz + 16,670,395,724 instructions # 2.35 insn per cycle + 2.317849748 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1338) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.554553e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231825e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.231825e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.492331e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.196200e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.196200e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.738890 sec - 5,258,481,013 cycles # 3.015 GHz - 10,633,983,479 instructions # 2.02 insn per cycle - 2.031221831 seconds time elapsed +TOTAL : 1.759548 sec + 5,233,152,516 cycles # 2.967 GHz + 10,634,018,962 instructions # 2.03 insn per cycle + 1.771464615 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1110) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.590256e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.320281e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.320281e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.707001e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313255e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313255e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.727761 sec - 5,261,126,335 cycles # 3.034 GHz - 10,493,915,684 instructions # 1.99 insn per cycle - 2.080009194 seconds time elapsed +TOTAL : 1.696242 sec + 5,111,285,292 cycles # 3.005 GHz + 10,493,325,522 instructions # 2.05 insn per cycle + 1.711751099 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1062) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.697267e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.267441e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.267441e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.642010e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.241108e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.241108e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.695110 sec - 4,667,181,727 cycles # 2.744 GHz - 8,877,988,853 instructions # 1.90 insn per cycle - 1.868095680 seconds time elapsed +TOTAL : 1.712084 sec + 4,653,252,953 cycles # 2.711 GHz + 8,877,573,860 instructions # 1.91 insn per cycle + 1.718230554 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 377) (512y: 0) (512z: 678) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 74c9754827..212456d513 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:42:24 +DATE: 2023-06-16_23:09:44 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.096867e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184386e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.791475e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391928e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.360941e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.806903e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.609431 sec - 2,467,715,938 cycles # 2.929 GHz - 3,466,891,826 instructions # 1.40 insn per cycle - 0.900213020 seconds time elapsed +TOTAL : 0.595523 sec + 2,413,044,519 cycles # 2.909 GHz + 3,371,780,308 instructions # 1.40 insn per cycle + 0.887688074 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 96 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.012737e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.065479e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.065479e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.031021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.106428e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.106428e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 2.453624 sec - 7,421,628,585 cycles # 3.019 GHz - 16,634,148,867 instructions # 2.24 insn per cycle - 2.460468767 seconds time elapsed +TOTAL : 2.437957 sec + 7,408,174,050 cycles # 3.033 GHz + 16,633,646,919 instructions # 2.25 insn per cycle + 2.444238664 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 226) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.487661e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.299067e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.299067e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.503700e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305405e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.305405e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.766145 sec - 5,435,578,305 cycles # 3.071 GHz - 11,183,236,880 instructions # 2.06 insn per cycle - 1.779233345 seconds time elapsed +TOTAL : 1.757246 sec + 5,418,067,590 cycles # 3.076 GHz + 11,183,088,134 instructions # 2.06 insn per cycle + 1.763156753 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 532) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.728284e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.635625e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.635625e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.726065e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.682697e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.682697e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.453301 sec - 4,419,042,730 cycles # 3.030 GHz - 8,688,677,902 instructions # 1.97 insn per cycle - 1.466098468 seconds time elapsed +TOTAL : 1.451397 sec + 4,426,469,159 cycles # 3.040 GHz + 8,688,961,662 instructions # 1.96 insn per cycle + 1.457509989 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 530) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.744373e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.749764e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.749764e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.791450e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.780450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780450e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.455693 sec - 4,382,746,772 cycles # 3.002 GHz - 8,633,835,601 instructions # 1.97 insn per cycle - 1.467002909 seconds time elapsed +TOTAL : 1.446231 sec + 4,397,730,279 cycles # 3.031 GHz + 8,635,389,369 instructions # 1.96 insn per cycle + 1.458743810 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 502) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.519353e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.183891e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.183891e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.422680e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.137968e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.137968e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.497900 sec - 4,248,985,836 cycles # 2.826 GHz - 8,218,002,385 instructions # 1.93 insn per cycle - 1.510189260 seconds time elapsed +TOTAL : 1.522551 sec + 4,240,504,189 cycles # 2.777 GHz + 8,218,517,462 instructions # 1.94 insn per cycle + 1.543890893 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 345) (512y: 0) (512z: 301) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index b7dc434092..ac5f47f7f2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:42:45 +DATE: 2023-06-16_23:10:05 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=2, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.099332e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204196e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.915255e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.391214e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.376725e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.873588e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371686e-02 +- 3.270219e-06 ) GeV^0 -TOTAL : 0.612266 sec - 2,464,042,899 cycles # 2.911 GHz - 3,475,864,574 instructions # 1.41 insn per cycle - 0.904505664 seconds time elapsed +TOTAL : 0.594001 sec + 2,406,505,129 cycles # 2.898 GHz + 3,366,385,454 instructions # 1.40 insn per cycle + 0.891567175 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 80 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.020524e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.020477e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.020477e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.071244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.178096e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.178096e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.937315 sec - 5,817,973,732 cycles # 2.998 GHz - 12,919,490,172 instructions # 2.22 insn per cycle - 1.943720405 seconds time elapsed +TOTAL : 1.909389 sec + 5,830,536,600 cycles # 3.046 GHz + 12,919,583,380 instructions # 2.22 insn per cycle + 1.916035180 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 196) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=6, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.005682e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.885289e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.885289e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.045068e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.916048e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916048e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270373e-06 ) GeV^0 -TOTAL : 1.622803 sec - 4,951,349,178 cycles # 3.043 GHz - 9,983,188,986 instructions # 2.02 insn per cycle - 1.635045128 seconds time elapsed +TOTAL : 1.609251 sec + 4,943,298,493 cycles # 3.063 GHz + 9,983,440,182 instructions # 2.02 insn per cycle + 1.615247999 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 391) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.099359e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.555505e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.555505e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.077418e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611229e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611229e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.383980 sec - 4,248,113,946 cycles # 3.059 GHz - 8,332,285,269 instructions # 1.96 insn per cycle - 1.390257290 seconds time elapsed +TOTAL : 1.386065 sec + 4,257,966,284 cycles # 3.061 GHz + 8,332,432,046 instructions # 1.96 insn per cycle + 1.401917759 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 418) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.179241e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.843222e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.843222e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.185758e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.872361e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.872361e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270338e-06 ) GeV^0 -TOTAL : 1.374154 sec - 4,200,607,975 cycles # 3.046 GHz - 8,344,023,002 instructions # 1.99 insn per cycle - 1.380829848 seconds time elapsed +TOTAL : 1.368255 sec + 4,217,890,128 cycles # 3.071 GHz + 8,344,168,177 instructions # 1.98 insn per cycle + 1.374282023 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 404) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=4, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.676137e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.526864e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.526864e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.831994e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.619622e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.619622e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371705e-02 +- 3.270339e-06 ) GeV^0 -TOTAL : 1.470822 sec - 4,189,822,650 cycles # 2.837 GHz - 8,053,565,427 instructions # 1.92 insn per cycle - 1.489148368 seconds time elapsed +TOTAL : 1.433154 sec + 4,169,968,523 cycles # 2.901 GHz + 8,053,769,834 instructions # 1.93 insn per cycle + 1.439092794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 297) (512y: 0) (512z: 234) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 1a047ee29b..6dfac9c1ed 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:04:33 +DATE: 2023-06-16_22:50:10 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.495972e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.803969e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.765235e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.376494e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.953518e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.671712e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.811348 sec - 2,855,784,020 cycles # 2.902 GHz - 4,072,692,460 instructions # 1.43 insn per cycle - 1.359642965 seconds time elapsed +TOTAL : 0.688402 sec + 2,700,826,195 cycles # 2.905 GHz + 3,806,980,667 instructions # 1.41 insn per cycle + 0.989702055 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 150 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.182316e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.457012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.457012e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.180803e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.453359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.453359e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.728144 sec - 17,673,216,488 cycles # 3.083 GHz - 41,243,579,659 instructions # 2.33 insn per cycle - 5.779259471 seconds time elapsed +TOTAL : 5.728686 sec + 17,708,432,158 cycles # 3.090 GHz + 41,244,089,604 instructions # 2.33 insn per cycle + 5.734765056 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 377) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.082176e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.214997e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.214997e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.063645e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.188199e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.188199e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.425697 sec - 10,595,307,110 cycles # 3.089 GHz - 25,489,411,975 instructions # 2.41 insn per cycle - 3.562727854 seconds time elapsed +TOTAL : 3.450990 sec + 10,583,383,925 cycles # 3.064 GHz + 25,489,452,356 instructions # 2.41 insn per cycle + 3.462562119 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1316) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.040460e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.094813e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.094813e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.037721e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.147402e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.147402e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.464838 sec - 7,419,964,789 cycles # 3.003 GHz - 14,282,393,000 instructions # 1.92 insn per cycle - 2.780056473 seconds time elapsed +TOTAL : 2.460588 sec + 7,441,510,421 cycles # 3.018 GHz + 14,282,092,763 instructions # 1.92 insn per cycle + 2.472351528 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1222) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.113996e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.513835e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.513835e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.158847e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.607374e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.607374e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.414652 sec - 7,234,011,677 cycles # 2.989 GHz - 13,977,869,835 instructions # 1.93 insn per cycle - 3.306544804 seconds time elapsed +TOTAL : 2.383448 sec + 7,184,403,404 cycles # 3.008 GHz + 13,977,543,468 instructions # 1.95 insn per cycle + 2.395596096 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.883810e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.597744e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.597744e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.933204e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.709618e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.709618e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.590521 sec - 6,554,526,949 cycles # 2.526 GHz - 10,868,521,395 instructions # 1.66 insn per cycle - 2.860310662 seconds time elapsed +TOTAL : 2.541646 sec + 6,535,344,396 cycles # 2.567 GHz + 10,866,787,933 instructions # 1.66 insn per cycle + 2.548121091 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 473) (512y: 0) (512z: 707) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 4831b5d8e9..a6191f8a49 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-07-20_21:05:06 +DATE: 2023-06-16_22:50:39 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.558572e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.375377e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.080483e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.486758e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.407739e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073604e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.725071 sec - 2,830,620,305 cycles # 2.911 GHz - 4,078,454,550 instructions # 1.44 insn per cycle - 1.184375737 seconds time elapsed +TOTAL : 0.687151 sec + 2,695,987,556 cycles # 2.909 GHz + 3,846,627,469 instructions # 1.43 insn per cycle + 0.990030497 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 118 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.179454e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.453200e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.453200e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.184791e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.461890e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.461890e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.735943 sec - 17,659,725,406 cycles # 3.076 GHz - 41,192,294,387 instructions # 2.33 insn per cycle - 5.768738374 seconds time elapsed +TOTAL : 5.709105 sec + 17,648,992,290 cycles # 3.090 GHz + 41,192,633,916 instructions # 2.33 insn per cycle + 5.715345529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 364) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.055464e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.165706e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.165706e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065279e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.171891e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.171891e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.461303 sec - 10,675,041,912 cycles # 3.078 GHz - 25,450,477,117 instructions # 2.38 insn per cycle - 3.527134520 seconds time elapsed +TOTAL : 3.445809 sec + 10,635,101,807 cycles # 3.083 GHz + 25,450,128,846 instructions # 2.39 insn per cycle + 3.457774379 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1303) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.982353e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.005232e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.005232e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.055789e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.156027e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.156027e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.513517 sec - 7,442,164,804 cycles # 2.954 GHz - 14,257,301,905 instructions # 1.92 insn per cycle - 2.558727207 seconds time elapsed +TOTAL : 2.453672 sec + 7,396,722,782 cycles # 3.009 GHz + 14,256,099,046 instructions # 1.93 insn per cycle + 2.460024595 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1202) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.160981e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.600668e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.600668e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.105741e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.538001e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.538001e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.385626 sec - 7,155,283,127 cycles # 2.994 GHz - 13,964,581,713 instructions # 1.95 insn per cycle - 2.440059504 seconds time elapsed +TOTAL : 2.423016 sec + 7,227,506,496 cycles # 2.978 GHz + 13,966,530,659 instructions # 1.93 insn per cycle + 2.435636437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1150) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.982337e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.944601e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.944601e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.961514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.946738e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.946738e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.508312 sec - 6,474,260,351 cycles # 2.576 GHz - 10,746,769,958 instructions # 1.66 insn per cycle - 2.557986329 seconds time elapsed +TOTAL : 2.525122 sec + 6,474,425,117 cycles # 2.561 GHz + 10,746,498,563 instructions # 1.66 insn per cycle + 2.536989466 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 453) (512y: 0) (512z: 688) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index ca4961374c..a72bc96234 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:05:37 +DATE: 2023-07-20_17:36:31 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.010666e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136620e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273568e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.003367e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.129514e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264428e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.562217 sec - 2,309,566,069 cycles # 2.862 GHz - 2,931,235,249 instructions # 1.27 insn per cycle - 1.192343544 seconds time elapsed +TOTAL : 0.559201 sec + 2,344,901,473 cycles # 2.919 GHz + 2,930,110,698 instructions # 1.25 insn per cycle + 0.861924488 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.973676e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.037445e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.037445e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.900323e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.964314e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964314e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.424483 sec - 16,850,631,043 cycles # 3.104 GHz - 45,503,336,916 instructions # 2.70 insn per cycle - 5.465565905 seconds time elapsed +TOTAL : 5.666953 sec + 17,034,536,839 cycles # 3.000 GHz + 45,603,039,359 instructions # 2.68 insn per cycle + 5.680979221 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.455990e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675616e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.403774e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.621403e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.621403e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.145030 sec - 9,719,201,416 cycles # 3.086 GHz - 27,313,311,809 instructions # 2.81 insn per cycle - 3.536685405 seconds time elapsed +TOTAL : 3.226902 sec + 9,890,519,824 cycles # 3.053 GHz + 27,411,762,268 instructions # 2.77 insn per cycle + 3.245927523 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2528) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -120,14 +120,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.040366e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.716195e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.716195e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.940305e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.630785e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.630785e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.843970 sec - 5,401,724,413 cycles # 2.920 GHz - 11,620,210,424 instructions # 2.15 insn per cycle - 1.949159414 seconds time elapsed +TOTAL : 1.911307 sec + 5,554,271,159 cycles # 2.887 GHz + 11,718,779,585 instructions # 2.11 insn per cycle + 1.930632243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2391) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -146,14 +146,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.571696e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.385310e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.385310e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.524135e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.359151e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.359151e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.703083 sec - 5,009,501,463 cycles # 2.930 GHz - 11,044,838,584 instructions # 2.20 insn per cycle - 1.772528046 seconds time elapsed +TOTAL : 1.751816 sec + 5,153,612,038 cycles # 2.921 GHz + 11,141,453,691 instructions # 2.16 insn per cycle + 1.765393243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2177) (512y: 100) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.180760e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.499677e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.499677e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.152598e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.487658e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.487658e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.628134 sec - 5,271,469,183 cycles # 2.010 GHz - 7,421,011,252 instructions # 1.41 insn per cycle - 2.649372413 seconds time elapsed +TOTAL : 2.676718 sec + 5,425,832,216 cycles # 2.018 GHz + 7,519,714,033 instructions # 1.39 insn per cycle + 2.690815651 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1133) (512y: 122) (512z: 1711) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 00437ee654..639aca4e98 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:52:58 +DATE: 2023-06-16_23:19:50 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.926299e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.850691e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.850691e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.044204e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.996138e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.996138e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.875167 sec - 3,304,824,103 cycles # 2.927 GHz - 4,663,734,046 instructions # 1.41 insn per cycle - 1.188015020 seconds time elapsed +TOTAL : 0.864809 sec + 3,286,181,196 cycles # 2.941 GHz + 4,692,253,429 instructions # 1.43 insn per cycle + 1.175702347 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -77,15 +77,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948919e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.011346e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.011346e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.945517e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.007133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.007133e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.567531 sec - 17,203,561,499 cycles # 3.088 GHz - 45,582,230,516 instructions # 2.65 insn per cycle - 5.575165917 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.579973 sec + 17,197,411,796 cycles # 3.079 GHz + 45,599,315,728 instructions # 2.65 insn per cycle + 5.587453353 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -104,15 +104,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.433557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.642940e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.642940e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.516201e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.735806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.735806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.239589 sec - 10,071,146,912 cycles # 3.103 GHz - 27,498,259,382 instructions # 2.73 insn per cycle - 3.253276264 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2528) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.173720 sec + 9,799,980,850 cycles # 3.084 GHz + 26,760,156,296 instructions # 2.73 insn per cycle + 3.191189055 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -131,15 +131,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.909121e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.555755e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.555755e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.004187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.680852e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.680852e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.961762 sec - 5,740,889,738 cycles # 2.918 GHz - 11,908,955,349 instructions # 2.07 insn per cycle - 1.975864642 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2391) (512y: 0) (512z: 0) +TOTAL : 1.934049 sec + 5,690,588,652 cycles # 2.933 GHz + 11,606,672,585 instructions # 2.04 insn per cycle + 1.948751225 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -158,15 +158,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.429737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.198297e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.198297e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.578597e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.386711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.386711e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.818718 sec - 5,324,447,602 cycles # 2.918 GHz - 11,333,453,336 instructions # 2.13 insn per cycle - 1.832254547 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2177) (512y: 100) (512z: 0) +TOTAL : 1.781882 sec + 5,244,605,674 cycles # 2.934 GHz + 11,027,240,287 instructions # 2.10 insn per cycle + 1.797644688 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.145364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.453022e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.453022e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.159645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.469350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.469350e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.718564 sec - 5,616,717,081 cycles # 2.062 GHz - 7,666,782,003 instructions # 1.36 insn per cycle - 2.735672476 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1133) (512y: 122) (512z: 1711) +TOTAL : 2.712347 sec + 5,634,887,558 cycles # 2.074 GHz + 7,322,991,974 instructions # 1.30 insn per cycle + 2.729029702 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index b34f61f463..654e369bcd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_22:05:20 +DATE: 2023-06-16_23:32:09 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.546552e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155983e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273777e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.720407e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158709e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.266813e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.644527 sec - 2,574,801,540 cycles # 2.918 GHz - 3,380,374,903 instructions # 1.31 insn per cycle - 0.940442343 seconds time elapsed +TOTAL : 0.643304 sec + 2,557,243,803 cycles # 2.902 GHz + 3,323,241,435 instructions # 1.30 insn per cycle + 0.940843234 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.967098e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.030972e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.030972e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.976660e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.040382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.040382e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.498792 sec - 17,031,144,653 cycles # 3.095 GHz - 45,523,541,105 instructions # 2.67 insn per cycle - 5.505049605 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.473136 sec + 17,003,987,037 cycles # 3.104 GHz + 45,540,260,291 instructions # 2.68 insn per cycle + 5.479696030 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.454490e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.671509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.671509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.560915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.788844e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.788844e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.203110 sec - 9,890,713,540 cycles # 3.083 GHz - 27,315,954,891 instructions # 2.76 insn per cycle - 3.215846459 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2528) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.112416 sec + 9,646,257,782 cycles # 3.095 GHz + 26,577,811,209 instructions # 2.76 insn per cycle + 3.124182275 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.990317e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.660625e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.660625e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.139210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.855962e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.855962e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.918329 sec - 5,584,944,734 cycles # 2.904 GHz - 11,606,463,802 instructions # 2.08 insn per cycle - 1.930027759 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2391) (512y: 0) (512z: 0) +TOTAL : 1.874805 sec + 5,513,978,865 cycles # 2.934 GHz + 11,304,407,461 instructions # 2.05 insn per cycle + 1.886432659 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.550343e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.355958e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.355958e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.726123e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.568760e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.568760e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.770391 sec - 5,184,081,835 cycles # 2.920 GHz - 10,997,458,515 instructions # 2.12 insn per cycle - 1.782245625 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2177) (512y: 100) (512z: 0) +TOTAL : 1.727887 sec + 5,070,617,194 cycles # 2.927 GHz + 10,691,249,015 instructions # 2.11 insn per cycle + 1.739511158 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.232194e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.556099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.556099e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.172539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.488358e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.488358e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.646573 sec - 5,455,025,324 cycles # 2.057 GHz - 7,373,601,801 instructions # 1.35 insn per cycle - 2.658871533 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1133) (512y: 122) (512z: 1711) +TOTAL : 2.683562 sec + 5,459,517,935 cycles # 2.032 GHz + 7,027,784,941 instructions # 1.29 insn per cycle + 2.699812290 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index e969b839ed..a8675ddd60 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_22:02:17 +DATE: 2023-06-16_23:29:07 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.553051e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.150039e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266246e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.723347e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157467e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265955e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.582164 sec - 2,386,654,206 cycles # 2.907 GHz - 3,345,195,900 instructions # 1.40 insn per cycle - 0.878757760 seconds time elapsed +TOTAL : 0.586317 sec + 2,370,587,754 cycles # 2.873 GHz + 3,311,584,988 instructions # 1.40 insn per cycle + 0.883202740 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.952789e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015565e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.015565e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.965968e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.028993e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.028993e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.478519 sec - 16,848,760,257 cycles # 3.074 GHz - 45,504,656,377 instructions # 2.70 insn per cycle - 5.485153122 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.442844 sec + 16,814,668,007 cycles # 3.088 GHz + 45,523,651,040 instructions # 2.71 insn per cycle + 5.449380623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.443064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.656427e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.656427e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.557389e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.784830e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.784830e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.152846 sec - 9,719,631,346 cycles # 3.078 GHz - 27,312,840,509 instructions # 2.81 insn per cycle - 3.165688986 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2528) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.056098 sec + 9,441,724,564 cycles # 3.085 GHz + 26,574,286,961 instructions # 2.81 insn per cycle + 3.067987765 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.054961e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.728796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.728796e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.028635e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.705882e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.705882e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.840946 sec - 5,381,544,313 cycles # 2.916 GHz - 11,619,866,147 instructions # 2.16 insn per cycle - 1.852693961 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2391) (512y: 0) (512z: 0) +TOTAL : 1.847560 sec + 5,305,013,811 cycles # 2.865 GHz + 11,318,072,423 instructions # 2.13 insn per cycle + 1.862970960 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.602760e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.407261e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.407261e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.705810e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.546514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.546514e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.695443 sec - 4,961,496,665 cycles # 2.919 GHz - 11,044,689,545 instructions # 2.23 insn per cycle - 1.707292384 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2177) (512y: 100) (512z: 0) +TOTAL : 1.670710 sec + 4,873,672,906 cycles # 2.909 GHz + 10,738,237,712 instructions # 2.20 insn per cycle + 1.682858352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.246297e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.570618e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.570618e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.212444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.528464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.528464e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.576610 sec - 5,242,231,802 cycles # 2.032 GHz - 7,420,825,981 instructions # 1.42 insn per cycle - 2.593274557 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1133) (512y: 122) (512z: 1711) +TOTAL : 2.596550 sec + 5,275,280,525 cycles # 2.028 GHz + 7,074,607,503 instructions # 1.34 insn per cycle + 2.603068795 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index eadf74aa63..359002bbc2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:59:18 +DATE: 2023-06-16_23:26:08 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.897591e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156701e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272990e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.030270e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154608e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.262941e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.765804 sec - 2,966,751,079 cycles # 2.954 GHz - 4,210,078,628 instructions # 1.42 insn per cycle - 1.061943999 seconds time elapsed +TOTAL : 0.762017 sec + 2,931,874,448 cycles # 2.934 GHz + 4,162,111,608 instructions # 1.42 insn per cycle + 1.058050965 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -70,15 +70,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.949184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.013789e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.013789e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.969731e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.034215e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.034215e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.487398 sec - 16,849,883,250 cycles # 3.070 GHz - 45,504,539,519 instructions # 2.70 insn per cycle - 5.493810932 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.431365 sec + 16,808,611,250 cycles # 3.092 GHz + 45,520,552,491 instructions # 2.71 insn per cycle + 5.438033144 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -96,15 +96,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.443470e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.655923e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.655923e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.568471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.797188e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797188e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.154402 sec - 9,721,759,429 cycles # 3.080 GHz - 27,313,276,831 instructions # 2.81 insn per cycle - 3.165855874 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2528) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.046374 sec + 9,446,582,684 cycles # 3.097 GHz + 26,574,434,074 instructions # 2.81 insn per cycle + 3.058800051 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2475) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -122,15 +122,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.037386e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.702141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.702141e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.022985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.704154e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.704154e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.844533 sec - 5,391,904,010 cycles # 2.916 GHz - 11,619,902,050 instructions # 2.16 insn per cycle - 1.856384671 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2391) (512y: 0) (512z: 0) +TOTAL : 1.849517 sec + 5,358,628,341 cycles # 2.890 GHz + 11,318,219,306 instructions # 2.11 insn per cycle + 1.862335681 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2317) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.579084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.370438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.370438e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.716217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.550859e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.550859e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.701102 sec - 4,967,583,215 cycles # 2.912 GHz - 11,044,550,617 instructions # 2.22 insn per cycle - 1.712970556 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2177) (512y: 100) (512z: 0) +TOTAL : 1.668825 sec + 4,888,461,793 cycles # 2.922 GHz + 10,738,236,069 instructions # 2.20 insn per cycle + 1.680870974 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2116) (512y: 84) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,15 +174,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.215349e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.535984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.535984e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.203955e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.527248e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.527248e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.596179 sec - 5,260,486,062 cycles # 2.023 GHz - 7,420,872,251 instructions # 1.41 insn per cycle - 2.602976893 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1133) (512y: 122) (512z: 1711) +TOTAL : 2.602140 sec + 5,278,828,196 cycles # 2.025 GHz + 7,074,793,601 instructions # 1.34 insn per cycle + 2.608554211 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1084) (512y: 95) (512z: 1629) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index ce5ba45f1a..eac3f7700f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:06:06 +DATE: 2023-06-16_22:51:35 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.998352e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.124151e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.260557e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.913985e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.133710e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264879e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.603844 sec - 2,309,713,417 cycles # 2.854 GHz - 2,930,114,843 instructions # 1.27 insn per cycle - 1.030459539 seconds time elapsed +TOTAL : 0.547036 sec + 2,265,211,371 cycles # 2.879 GHz + 2,883,998,397 instructions # 1.27 insn per cycle + 0.846249580 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.011050e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.078425e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.078425e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.025487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.092839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.092839e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.323633 sec - 16,441,170,315 cycles # 3.085 GHz - 44,495,110,833 instructions # 2.71 insn per cycle - 5.354507659 seconds time elapsed +TOTAL : 5.293594 sec + 16,435,613,118 cycles # 3.105 GHz + 44,496,848,749 instructions # 2.71 insn per cycle + 5.300078017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 576) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.695693e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.943557e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.943557e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.743778e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.997210e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.997210e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.946114 sec - 9,118,388,042 cycles # 3.088 GHz - 26,080,206,777 instructions # 2.86 insn per cycle - 3.097145793 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2336) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.909580 sec + 8,996,897,610 cycles # 3.087 GHz + 25,400,434,000 instructions # 2.82 insn per cycle + 2.923168620 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2305) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.445608e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.001201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.001201e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.683039e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.268975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.268975e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.033234 sec - 5,936,288,666 cycles # 2.911 GHz - 13,130,180,553 instructions # 2.21 insn per cycle - 2.299617104 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2556) (512y: 0) (512z: 0) +TOTAL : 1.953170 sec + 5,745,024,701 cycles # 2.934 GHz + 12,420,701,802 instructions # 2.16 insn per cycle + 1.968877475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2408) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.697120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.297325e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.297325e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.895401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.531291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.531291e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.948602 sec - 5,690,852,369 cycles # 2.911 GHz - 12,726,971,997 instructions # 2.24 insn per cycle - 2.109721852 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 275) (512z: 0) +TOTAL : 1.886801 sec + 5,519,025,390 cycles # 2.918 GHz + 12,000,810,261 instructions # 2.17 insn per cycle + 1.892947629 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2127) (512y: 239) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.998589e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.284698e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.284698e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.060444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.357914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.357914e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.730846 sec - 5,536,346,809 cycles # 2.023 GHz - 9,255,690,681 instructions # 1.67 insn per cycle - 2.846386984 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1105) (512y: 224) (512z: 1900) +TOTAL : 2.690681 sec + 5,453,449,759 cycles # 2.024 GHz + 8,526,907,153 instructions # 1.56 insn per cycle + 2.696635390 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1067) (512y: 204) (512z: 1715) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index f2aef146c7..e46f7db696 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:43:05 +DATE: 2023-06-16_23:10:25 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.279755e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156089e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273611e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.728254e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165396e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275486e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.557758 sec - 2,321,832,135 cycles # 2.895 GHz - 2,934,847,494 instructions # 1.26 insn per cycle - 0.859841368 seconds time elapsed +TOTAL : 0.552941 sec + 2,310,610,153 cycles # 2.859 GHz + 2,915,241,692 instructions # 1.26 insn per cycle + 0.867252595 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.582180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.692447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.692447e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.605414e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.718612e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.718612e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.169099 sec - 12,884,243,812 cycles # 3.088 GHz - 34,451,443,411 instructions # 2.67 insn per cycle - 4.176090940 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 680) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.132902 sec + 12,781,577,555 cycles # 3.090 GHz + 34,468,010,618 instructions # 2.70 insn per cycle + 4.139327277 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.180793e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.361370e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.361370e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.146546e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.413092 sec - 10,536,089,302 cycles # 3.087 GHz - 23,575,963,151 instructions # 2.24 insn per cycle - 3.430021980 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2580) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.441664 sec + 10,637,428,563 cycles # 3.088 GHz + 22,946,767,172 instructions # 2.16 insn per cycle + 3.453622149 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2554) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.470144e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.018737e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.018737e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.616486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.203845e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.203845e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.023931 sec - 5,944,485,095 cycles # 2.929 GHz - 11,674,858,192 instructions # 1.96 insn per cycle - 2.036797598 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2946) (512y: 0) (512z: 0) +TOTAL : 1.975506 sec + 5,785,528,635 cycles # 2.924 GHz + 10,765,799,966 instructions # 1.86 insn per cycle + 1.987834132 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -136,8 +136,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= @@ -146,15 +146,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.538017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.097276e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.097276e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.684700e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.274843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.274843e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.001545 sec - 5,800,472,741 cycles # 2.890 GHz - 10,795,230,007 instructions # 1.86 insn per cycle - 2.008077471 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2489) (512y: 212) (512z: 0) +TOTAL : 1.951945 sec + 5,696,598,180 cycles # 2.912 GHz + 9,993,327,583 instructions # 1.75 insn per cycle + 1.964406649 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 159) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -162,8 +162,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= @@ -172,15 +172,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.357221e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.697617e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.697617e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.413791e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.764783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.764783e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.514014 sec - 5,140,335,603 cycles # 2.041 GHz - 8,459,272,146 instructions # 1.65 insn per cycle - 2.521134536 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1781) (512y: 292) (512z: 1897) +TOTAL : 2.484051 sec + 5,055,557,870 cycles # 2.031 GHz + 7,607,210,183 instructions # 1.50 insn per cycle + 2.496248175 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1617) (512y: 257) (512z: 1663) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 5797bdb3f5..336c6b2c5b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:43:31 +DATE: 2023-06-16_23:10:52 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.284928e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155357e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270779e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.704482e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154812e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264069e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.554293 sec - 2,314,131,194 cycles # 2.894 GHz - 2,889,755,564 instructions # 1.25 insn per cycle - 0.857897683 seconds time elapsed +TOTAL : 0.548991 sec + 2,291,931,243 cycles # 2.897 GHz + 2,895,980,863 instructions # 1.26 insn per cycle + 0.849095215 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.640742e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.757240e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.757240e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.696861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.817621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.817621e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.081626 sec - 12,433,349,505 cycles # 3.042 GHz - 35,154,422,765 instructions # 2.83 insn per cycle - 4.088491780 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 456) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.995865 sec + 12,393,440,631 cycles # 3.099 GHz + 35,121,655,729 instructions # 2.83 insn per cycle + 4.002434492 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 458) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.179152e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.359270e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359270e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.203775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.393294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.393294e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.406086 sec - 10,558,127,928 cycles # 3.094 GHz - 22,542,201,173 instructions # 2.14 insn per cycle - 3.413057800 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.383474 sec + 10,433,445,794 cycles # 3.079 GHz + 22,106,457,751 instructions # 2.12 insn per cycle + 3.399221962 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2351) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.910082e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.563050e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.563050e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.959707e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.624477e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.624477e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.882529 sec - 5,536,469,415 cycles # 2.932 GHz - 11,147,438,636 instructions # 2.01 insn per cycle - 1.894771419 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2332) (512y: 0) (512z: 0) +TOTAL : 1.874404 sec + 5,449,857,591 cycles # 2.908 GHz + 10,370,988,020 instructions # 1.90 insn per cycle + 1.886802544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.760206e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.379868e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.379868e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.940537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.590919e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.590919e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.927727 sec - 5,611,296,725 cycles # 2.903 GHz - 10,403,071,856 instructions # 1.85 insn per cycle - 1.934743480 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1995) (512y: 157) (512z: 0) +TOTAL : 1.873958 sec + 5,488,219,503 cycles # 2.922 GHz + 9,577,123,391 instructions # 1.75 insn per cycle + 1.890504897 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1857) (512y: 115) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.497092e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.863642e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.863642e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.678748e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.078914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.078914e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.440279 sec - 4,967,221,925 cycles # 2.032 GHz - 8,222,464,433 instructions # 1.66 insn per cycle - 2.452221377 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1423) (512y: 220) (512z: 1508) +TOTAL : 2.351438 sec + 4,817,365,178 cycles # 2.046 GHz + 7,401,125,646 instructions # 1.54 insn per cycle + 2.365073669 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1298) (512y: 193) (512z: 1369) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 66be2b8946..51b62d4486 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:06:35 +DATE: 2023-06-16_22:52:02 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.280257e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.598534e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.981267e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.086859e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.694248e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.954158e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.576006 sec - 2,178,385,850 cycles # 2.872 GHz - 2,669,198,126 instructions # 1.23 insn per cycle - 1.061814200 seconds time elapsed +TOTAL : 0.501803 sec + 2,114,448,212 cycles # 2.892 GHz + 2,650,449,805 instructions # 1.25 insn per cycle + 0.791582379 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.051035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.109607e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.109607e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.045424e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.103487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.103487e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.199720 sec - 16,075,811,088 cycles # 3.089 GHz - 45,313,536,642 instructions # 2.82 insn per cycle - 5.225703980 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.214918 sec + 16,086,691,839 cycles # 3.084 GHz + 45,264,306,297 instructions # 2.81 insn per cycle + 5.221298921 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.935471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.324951e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.324951e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.087937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.510673e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.510673e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.230888 sec - 6,845,063,261 cycles # 3.083 GHz - 17,292,911,963 instructions # 2.53 insn per cycle - 2.318223391 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3182) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.147379 sec + 6,633,404,493 cycles # 3.083 GHz + 16,691,710,310 instructions # 2.52 insn per cycle + 2.153586806 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.011475e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178585e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.178585e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.082531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.272863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272863e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.125721 sec - 3,299,489,082 cycles # 2.918 GHz - 7,320,764,209 instructions # 2.22 insn per cycle - 1.292961901 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2868) (512y: 0) (512z: 0) +TOTAL : 1.053599 sec + 3,105,351,321 cycles # 2.935 GHz + 7,028,445,226 instructions # 2.26 insn per cycle + 1.065166686 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.066400e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.254694e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.254694e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.144769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.363322e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.363322e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.071144 sec - 3,118,800,415 cycles # 2.894 GHz - 6,982,391,368 instructions # 2.24 insn per cycle - 1.256112603 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2710) (512y: 12) (512z: 0) +TOTAL : 1.001609 sec + 2,959,023,588 cycles # 2.940 GHz + 6,742,632,212 instructions # 2.28 insn per cycle + 1.013857674 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.672831e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.556856e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.556856e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.162662e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.186029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.186029e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.453486 sec - 3,050,353,960 cycles # 2.090 GHz - 5,144,038,995 instructions # 1.69 insn per cycle - 1.511154293 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1824) (512y: 30) (512z: 1955) +TOTAL : 1.370260 sec + 2,871,098,848 cycles # 2.089 GHz + 4,848,815,662 instructions # 1.69 insn per cycle + 1.382844305 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 2c21e200ee..500d268665 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:53:26 +DATE: 2023-06-16_23:20:18 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.958165e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.569098e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.569098e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.110307e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.876714e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.876714e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086808e+00 +- 3.414090e-03 ) GeV^0 -TOTAL : 0.701037 sec - 2,758,853,932 cycles # 2.901 GHz - 3,866,843,604 instructions # 1.40 insn per cycle - 1.009181417 seconds time elapsed +TOTAL : 0.696990 sec + 2,704,320,756 cycles # 2.897 GHz + 3,812,649,511 instructions # 1.41 insn per cycle + 0.992471982 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -77,15 +77,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.038993e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.096878e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.096878e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.034027e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.092499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.092499e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.272567 sec - 16,263,091,912 cycles # 3.082 GHz - 45,361,762,688 instructions # 2.79 insn per cycle - 5.280315806 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.291084 sec + 16,288,379,429 cycles # 3.078 GHz + 45,312,040,286 instructions # 2.78 insn per cycle + 5.298079548 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -104,15 +104,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.898361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.275323e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.275323e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.061059e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.467740e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.467740e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.274757 sec - 7,040,627,423 cycles # 3.087 GHz - 17,573,092,954 instructions # 2.50 insn per cycle - 2.288078283 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3182) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.203734 sec + 6,829,055,399 cycles # 3.092 GHz + 16,972,646,719 instructions # 2.49 insn per cycle + 2.216688904 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -131,15 +131,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.962611e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157504e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.157504e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.061515e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.242839e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242839e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.183658 sec - 3,498,617,091 cycles # 2.940 GHz - 7,558,058,575 instructions # 2.16 insn per cycle - 1.196396802 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2868) (512y: 0) (512z: 0) +TOTAL : 1.119383 sec + 3,302,419,161 cycles # 2.935 GHz + 7,266,282,160 instructions # 2.20 insn per cycle + 1.131841634 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -158,15 +158,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.062066e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.245829e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.245829e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.096474e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.293763e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.293763e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.118710 sec - 3,308,081,890 cycles # 2.940 GHz - 7,219,732,778 instructions # 2.18 insn per cycle - 1.131416278 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2710) (512y: 12) (512z: 0) +TOTAL : 1.088730 sec + 3,150,094,724 cycles # 2.877 GHz + 6,980,573,007 instructions # 2.22 insn per cycle + 1.104462726 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.555201e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.423008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.423008e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.080793e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.067489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.067489e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.521139 sec - 3,254,818,324 cycles # 2.130 GHz - 5,399,035,135 instructions # 1.66 insn per cycle - 1.528830619 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1824) (512y: 30) (512z: 1955) +TOTAL : 1.430225 sec + 3,084,429,817 cycles # 2.147 GHz + 5,103,946,062 instructions # 1.65 insn per cycle + 1.443126402 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 490aee355a..a6492e3922 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_22:05:48 +DATE: 2023-06-16_23:32:37 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.363180e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.631268e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.952491e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.808067e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.652444e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.960520e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.587204 sec - 2,368,791,568 cycles # 2.891 GHz - 3,091,871,506 instructions # 1.31 insn per cycle - 0.877101620 seconds time elapsed +TOTAL : 0.588246 sec + 2,402,908,432 cycles # 2.901 GHz + 3,110,614,158 instructions # 1.29 insn per cycle + 0.886196949 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.049923e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.108243e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.108243e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.033763e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.091227e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.091227e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 5.256103 sec - 16,234,632,942 cycles # 3.087 GHz - 45,340,296,546 instructions # 2.79 insn per cycle - 5.262048569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.296990 sec + 16,269,407,762 cycles # 3.070 GHz + 45,296,730,260 instructions # 2.78 insn per cycle + 5.303386291 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.891993e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.270729e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.270729e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.111672e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.523499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.523499e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079574e+00 +- 3.404724e-03 ) GeV^0 -TOTAL : 2.284631 sec - 7,011,039,110 cycles # 3.063 GHz - 17,305,883,397 instructions # 2.47 insn per cycle - 2.290768852 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3182) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.191606 sec + 6,792,051,649 cycles # 3.094 GHz + 16,705,051,298 instructions # 2.46 insn per cycle + 2.206896271 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.016646e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184088e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.184088e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.084154e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.276390e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276390e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.170907 sec - 3,454,043,441 cycles # 2.937 GHz - 7,305,123,696 instructions # 2.11 insn per cycle - 1.188152602 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2868) (512y: 0) (512z: 0) +TOTAL : 1.106630 sec + 3,283,893,034 cycles # 2.956 GHz + 7,013,364,350 instructions # 2.14 insn per cycle + 1.121623240 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.082758e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.272281e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272281e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.140251e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.355699e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.355699e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.110011 sec - 3,277,503,529 cycles # 2.939 GHz - 6,933,525,551 instructions # 2.12 insn per cycle - 1.122782503 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2710) (512y: 12) (512z: 0) +TOTAL : 1.061058 sec + 3,126,583,899 cycles # 2.934 GHz + 6,694,705,597 instructions # 2.14 insn per cycle + 1.076779612 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.713861e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.609163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.609163e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.180445e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.199369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.199369e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079552e+00 +- 3.404217e-03 ) GeV^0 -TOTAL : 1.500066 sec - 3,210,575,653 cycles # 2.133 GHz - 5,095,096,711 instructions # 1.59 insn per cycle - 1.511953034 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1824) (512y: 30) (512z: 1955) +TOTAL : 1.423127 sec + 3,043,448,384 cycles # 2.132 GHz + 4,800,221,178 instructions # 1.58 insn per cycle + 1.434967626 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 23da96bac1..5b694fb236 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_22:02:45 +DATE: 2023-06-16_23:29:34 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.368536e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.617380e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.935182e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.836019e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.651430e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966312e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.535526 sec - 2,199,164,699 cycles # 2.880 GHz - 3,057,074,391 instructions # 1.39 insn per cycle - 0.821337729 seconds time elapsed +TOTAL : 0.535629 sec + 2,203,011,013 cycles # 2.886 GHz + 3,060,363,719 instructions # 1.39 insn per cycle + 0.822111510 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.060441e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.120038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120038e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.025315e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.083260e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.083260e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.181233 sec - 16,068,023,342 cycles # 3.101 GHz - 45,314,566,058 instructions # 2.82 insn per cycle - 5.187449483 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.275218 sec + 16,091,710,481 cycles # 3.052 GHz + 45,265,017,659 instructions # 2.81 insn per cycle + 5.281528258 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.929887e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.311185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.311185e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.134361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.551581e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.551581e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.213215 sec - 6,838,534,608 cycles # 3.083 GHz - 17,292,273,299 instructions # 2.53 insn per cycle - 2.225484253 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3182) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.128639 sec + 6,626,366,458 cycles # 3.106 GHz + 16,691,617,090 instructions # 2.52 insn per cycle + 2.144077506 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.025754e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194251e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.194251e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.089985e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.279320e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279320e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.108322 sec - 3,276,999,534 cycles # 2.942 GHz - 7,320,267,712 instructions # 2.23 insn per cycle - 1.121154503 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2868) (512y: 0) (512z: 0) +TOTAL : 1.046794 sec + 3,097,881,265 cycles # 2.946 GHz + 7,028,413,867 instructions # 2.27 insn per cycle + 1.062080365 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052668e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.235070e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.235070e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.151751e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.366544e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.366544e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.084322 sec - 3,113,639,942 cycles # 2.857 GHz - 6,982,152,522 instructions # 2.24 insn per cycle - 1.120990778 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2710) (512y: 12) (512z: 0) +TOTAL : 0.995620 sec + 2,939,121,938 cycles # 2.938 GHz + 6,742,732,632 instructions # 2.29 insn per cycle + 1.001882555 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.538797e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.410356e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.410356e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.156539e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.166833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.166833e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.476876 sec - 3,040,687,373 cycles # 2.052 GHz - 5,143,834,780 instructions # 1.69 insn per cycle - 1.492448957 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1824) (512y: 30) (512z: 1955) +TOTAL : 1.371454 sec + 2,896,780,603 cycles # 2.105 GHz + 4,848,892,795 instructions # 1.67 insn per cycle + 1.377614632 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 246d432e18..f142e268bd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:59:45 +DATE: 2023-06-16_23:26:35 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.727463e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.645992e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969326e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.844906e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.667637e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968555e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086808e+00 +- 3.414090e-03 ) GeV^0 -TOTAL : 0.641456 sec - 2,529,945,672 cycles # 2.910 GHz - 3,543,183,507 instructions # 1.40 insn per cycle - 0.927396201 seconds time elapsed +TOTAL : 0.637578 sec + 2,532,069,078 cycles # 2.919 GHz + 3,546,459,768 instructions # 1.40 insn per cycle + 0.925210240 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -70,15 +70,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.039860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.097470e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.097470e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.098157e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.098157e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.227371 sec - 16,075,379,708 cycles # 3.073 GHz - 45,314,337,770 instructions # 2.82 insn per cycle - 5.233815191 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.228615 sec + 16,104,862,870 cycles # 3.079 GHz + 45,267,189,999 instructions # 2.81 insn per cycle + 5.234886365 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -96,15 +96,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.931026e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312724e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312724e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.110524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.524586e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.524586e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 2.213840 sec - 6,832,987,497 cycles # 3.081 GHz - 17,292,491,583 instructions # 2.53 insn per cycle - 2.225767243 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3182) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.139950 sec + 6,622,300,033 cycles # 3.088 GHz + 16,691,797,560 instructions # 2.52 insn per cycle + 2.152617384 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -122,15 +122,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.810725e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136654e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.136654e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.073753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.265534e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.265534e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.156085 sec - 3,294,193,875 cycles # 2.837 GHz - 7,321,286,340 instructions # 2.22 insn per cycle - 1.168681953 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2868) (512y: 0) (512z: 0) +TOTAL : 1.063032 sec + 3,116,892,646 cycles # 2.921 GHz + 7,028,647,548 instructions # 2.26 insn per cycle + 1.074679609 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2735) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.081060e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.269776e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269776e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.147777e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.361400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.361400e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.056524 sec - 3,110,658,041 cycles # 2.932 GHz - 6,982,071,978 instructions # 2.24 insn per cycle - 1.068066659 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2710) (512y: 12) (512z: 0) +TOTAL : 0.998684 sec + 2,942,336,224 cycles # 2.933 GHz + 6,742,794,858 instructions # 2.29 insn per cycle + 1.014229588 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2593) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,15 +174,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.244474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.093831e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.093831e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.156146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.170100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.170100e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.536586 sec - 3,046,728,575 cycles # 1.977 GHz - 5,144,164,200 instructions # 1.69 insn per cycle - 1.565931053 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1824) (512y: 30) (512z: 1955) +TOTAL : 1.371673 sec + 2,870,181,896 cycles # 2.085 GHz + 4,848,797,007 instructions # 1.69 insn per cycle + 1.377996079 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1722) (512y: 22) (512z: 1849) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 083156d73a..908359aae9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:07:01 +DATE: 2023-06-16_22:52:25 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.275508e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.643033e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.038106e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.091832e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.744916e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.014144e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.508922 sec - 2,137,462,369 cycles # 2.874 GHz - 2,682,038,097 instructions # 1.25 insn per cycle - 1.186052973 seconds time elapsed +TOTAL : 0.502212 sec + 2,095,360,596 cycles # 2.861 GHz + 2,633,922,104 instructions # 1.26 insn per cycle + 0.791606560 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.043226e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.103745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.103745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.080122e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.140133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.140133e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 5.223958 sec - 15,897,491,217 cycles # 3.040 GHz - 44,490,859,581 instructions # 2.80 insn per cycle - 5.269412195 seconds time elapsed +TOTAL : 5.126985 sec + 15,885,461,434 cycles # 3.097 GHz + 44,491,325,292 instructions # 2.80 insn per cycle + 5.132808718 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.973807e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.553775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.553775e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.233465e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.861527e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.861527e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.843036 sec - 5,709,294,114 cycles # 3.089 GHz - 16,310,638,189 instructions # 2.86 insn per cycle - 1.918054717 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2889) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.768519 sec + 5,514,620,841 cycles # 3.109 GHz + 15,833,995,859 instructions # 2.87 insn per cycle + 1.780397431 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2852) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.915722e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.657306e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.657306e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.563962e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.456105e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.456105e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.602976 sec - 4,700,668,988 cycles # 2.922 GHz - 9,282,824,737 instructions # 1.97 insn per cycle - 1.694604253 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3555) (512y: 0) (512z: 0) +TOTAL : 1.472161 sec + 4,310,727,148 cycles # 2.921 GHz + 8,709,473,097 instructions # 2.02 insn per cycle + 1.483481957 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3300) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.023406e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.785775e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.785775e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.777637e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.715040e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.715040e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.580743 sec - 4,625,283,437 cycles # 2.916 GHz - 9,083,220,915 instructions # 1.96 insn per cycle - 1.710629480 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3468) (512y: 2) (512z: 0) +TOTAL : 1.433573 sec + 4,206,371,569 cycles # 2.925 GHz + 8,430,013,752 instructions # 2.00 insn per cycle + 1.439953997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3203) (512y: 5) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.298566e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.711627e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.711627e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.942233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.458072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.458072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 2.091224 sec - 4,121,393,095 cycles # 1.988 GHz - 7,509,793,952 instructions # 1.82 insn per cycle - 2.124248328 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2532) (512y: 0) (512z: 2563) +TOTAL : 1.858685 sec + 3,800,386,936 cycles # 2.045 GHz + 6,743,751,136 instructions # 1.77 insn per cycle + 1.870417357 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2337) (512y: 12) (512z: 2190) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index f7a17ab031..03f579fc70 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:43:57 +DATE: 2023-06-16_23:11:17 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.231584e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.589988e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.966020e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.788614e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.673918e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.981454e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.508452 sec - 2,162,385,648 cycles # 2.911 GHz - 2,701,369,448 instructions # 1.25 insn per cycle - 0.800446043 seconds time elapsed +TOTAL : 0.503723 sec + 2,153,847,666 cycles # 2.865 GHz + 2,679,220,576 instructions # 1.24 insn per cycle + 0.809587763 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.674424e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.774799e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.774799e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.654314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.753826e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.753826e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.009583 sec - 12,379,645,506 cycles # 3.085 GHz - 34,383,759,433 instructions # 2.78 insn per cycle - 4.015988825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 700) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.037765 sec + 12,382,941,525 cycles # 3.063 GHz + 34,720,031,809 instructions # 2.80 insn per cycle + 4.044257307 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 710) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.732399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.254960e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.254960e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.839094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.391075e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.391075e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.916096 sec - 5,960,514,987 cycles # 3.102 GHz - 14,349,582,097 instructions # 2.41 insn per cycle - 1.928652103 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.882962 sec + 5,789,032,021 cycles # 3.067 GHz + 13,741,089,000 instructions # 2.37 insn per cycle + 1.901362450 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3019) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -110,8 +110,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199018687010 -Relative difference = 4.836865719316925e-08 +Avg ME (F77/C++) = 2.0288199222413823 +Relative difference = 3.8327016574625664e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= @@ -120,15 +120,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.138863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.153116e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.153116e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.215187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.053508e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053508e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.375198 sec - 4,071,518,344 cycles # 2.949 GHz - 8,260,550,371 instructions # 2.03 insn per cycle - 1.387174511 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4044) (512y: 0) (512z: 0) +TOTAL : 1.223202 sec + 3,599,020,391 cycles # 2.929 GHz + 7,571,835,022 instructions # 2.10 insn per cycle + 1.236020404 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3640) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -136,8 +136,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288187719415873 -Relative difference = 1.1240944243872765e-07 +Avg ME (F77/C++) = 2.0288187717087567 +Relative difference = 1.1252420410236244e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= @@ -146,15 +146,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.192903e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.228243e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.228243e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.176325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050726e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050726e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.367161 sec - 3,989,235,934 cycles # 2.906 GHz - 7,837,542,357 instructions # 1.96 insn per cycle - 1.380288587 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3804) (512y: 0) (512z: 0) +TOTAL : 1.228546 sec + 3,600,171,901 cycles # 2.920 GHz + 7,138,528,819 instructions # 1.98 insn per cycle + 1.240059403 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3407) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -162,8 +162,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288187719415873 -Relative difference = 1.1240944243872765e-07 +Avg ME (F77/C++) = 2.0288187717087567 +Relative difference = 1.1252420410236244e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= @@ -172,15 +172,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.266873e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.847476e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.847476e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.044103e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.781585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.781585e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.760322 sec - 3,607,803,706 cycles # 2.043 GHz - 6,928,716,427 instructions # 1.92 insn per cycle - 1.771923659 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3892) (512y: 0) (512z: 2500) +TOTAL : 1.574940 sec + 3,264,671,911 cycles # 2.066 GHz + 6,099,502,522 instructions # 1.87 insn per cycle + 1.581189951 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3531) (512y: 0) (512z: 2032) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,8 +188,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288188933627680 -Relative difference = 5.256123485318502e-08 +Avg ME (F77/C++) = 2.0288188919657841 +Relative difference = 5.324980483499375e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 47005f5c03..fc059049f2 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:44:20 +DATE: 2023-06-16_23:11:40 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.289182e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.634404e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.029488e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.829844e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.682794e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.002681e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.511719 sec - 2,143,898,219 cycles # 2.876 GHz - 2,675,091,843 instructions # 1.25 insn per cycle - 0.803453231 seconds time elapsed +TOTAL : 0.504050 sec + 2,156,269,977 cycles # 2.889 GHz + 2,661,239,316 instructions # 1.23 insn per cycle + 0.803885518 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.773999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.882417e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.882417e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.707277e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.810959e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.810959e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.867706 sec - 11,676,477,431 cycles # 3.016 GHz - 35,120,852,671 instructions # 3.01 insn per cycle - 3.874275994 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 475) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.961686 sec + 11,627,305,519 cycles # 2.933 GHz + 34,903,521,773 instructions # 3.00 insn per cycle + 3.967678741 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.881041e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.431512e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.431512e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.940642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.518726e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518726e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086781e+00 +- 3.413806e-03 ) GeV^0 -TOTAL : 1.869975 sec - 5,818,870,742 cycles # 3.103 GHz - 13,982,679,042 instructions # 2.40 insn per cycle - 1.876634194 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2574) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.852150 sec + 5,740,254,137 cycles # 3.092 GHz + 13,368,786,182 instructions # 2.33 insn per cycle + 1.858247428 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.253572e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.296006e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.296006e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.380718e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.076685e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.076685e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.356090 sec - 4,002,786,171 cycles # 2.940 GHz - 8,019,917,798 instructions # 2.00 insn per cycle - 1.368158113 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3268) (512y: 0) (512z: 0) +TOTAL : 1.203884 sec + 3,531,102,752 cycles # 2.922 GHz + 7,331,328,779 instructions # 2.08 insn per cycle + 1.215736492 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2935) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.389980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.482058e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.482058e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.449213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.085295e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.085295e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.337081 sec - 3,900,947,935 cycles # 2.906 GHz - 7,625,105,914 instructions # 1.95 insn per cycle - 1.349701901 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3024) (512y: 1) (512z: 0) +TOTAL : 1.195397 sec + 3,507,221,220 cycles # 2.922 GHz + 6,966,297,573 instructions # 1.99 insn per cycle + 1.207630553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2744) (512y: 1) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.457119e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.069402e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.069402e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.302201e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.102178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.102178e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086812e+00 +- 3.414242e-03 ) GeV^0 -TOTAL : 1.710783 sec - 3,531,936,310 cycles # 2.058 GHz - 6,791,295,224 instructions # 1.92 insn per cycle - 1.717620694 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3068) (512y: 0) (512z: 1969) +TOTAL : 1.522441 sec + 3,161,449,900 cycles # 2.070 GHz + 5,927,790,892 instructions # 1.88 insn per cycle + 1.534090530 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2811) (512y: 0) (512z: 1595) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 844b81d065..2eb3f90d4a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:07:28 +DATE: 2023-06-16_22:52:49 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.003626e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.127484e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.261853e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.883163e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153438e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268351e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.563107 sec - 2,367,064,554 cycles # 2.929 GHz - 2,964,134,522 instructions # 1.25 insn per cycle - 1.302732461 seconds time elapsed +TOTAL : 0.546765 sec + 2,277,271,038 cycles # 2.894 GHz + 2,877,502,583 instructions # 1.26 insn per cycle + 0.845241312 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,15 +68,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.953429e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.016479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.016479e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.947985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.010292e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.010292e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.481935 sec - 16,950,369,980 cycles # 3.092 GHz - 45,672,159,539 instructions # 2.69 insn per cycle - 5.510655192 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 624) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.490788 sec + 16,975,144,546 cycles # 3.089 GHz + 45,688,141,777 instructions # 2.69 insn per cycle + 5.496744556 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 625) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -94,15 +94,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.262329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.453354e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.453354e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.550845e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.778152e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.778152e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.323720 sec - 9,726,485,352 cycles # 2.922 GHz - 27,064,410,104 instructions # 2.78 insn per cycle - 3.418316609 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2578) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.061992 sec + 9,396,473,786 cycles # 3.065 GHz + 26,383,157,357 instructions # 2.81 insn per cycle + 3.078372883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2530) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.104299e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.806942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.806942e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.169685e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.889662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.889662e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.826293 sec - 5,351,707,652 cycles # 2.920 GHz - 11,506,713,483 instructions # 2.15 insn per cycle - 1.920329529 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2484) (512y: 0) (512z: 0) +TOTAL : 1.808628 sec + 5,256,027,628 cycles # 2.899 GHz + 11,191,741,528 instructions # 2.13 insn per cycle + 1.814736091 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.670084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.508458e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.508458e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.822007e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.696291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.696291e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.683232 sec - 4,935,938,815 cycles # 2.926 GHz - 10,952,966,962 instructions # 2.22 insn per cycle - 1.737940844 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2280) (512y: 102) (512z: 0) +TOTAL : 1.643994 sec + 4,833,892,867 cycles # 2.931 GHz + 10,629,174,604 instructions # 2.20 insn per cycle + 1.660177459 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2214) (512y: 86) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.295039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.630366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.630366e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.215185e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.538766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.538766e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.568633 sec - 5,229,119,714 cycles # 2.045 GHz - 7,299,926,391 instructions # 1.40 insn per cycle - 2.604255850 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1376) (512y: 125) (512z: 1765) +TOTAL : 2.596408 sec + 5,235,856,295 cycles # 2.013 GHz + 6,967,015,570 instructions # 1.33 insn per cycle + 2.609031673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1322) (512y: 98) (512z: 1681) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 4a1fbba2cf..2922f1f91d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-07-20_21:07:57 +DATE: 2023-06-16_22:53:16 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.016437e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.138803e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276616e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.913817e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.163142e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.279616e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.557701 sec - 2,323,423,896 cycles # 2.890 GHz - 2,917,581,955 instructions # 1.26 insn per cycle - 1.297907313 seconds time elapsed +TOTAL : 0.546816 sec + 2,259,525,710 cycles # 2.872 GHz + 2,874,198,806 instructions # 1.27 insn per cycle + 0.845865418 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -68,14 +68,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.996251e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.062441e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.062441e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.980085e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.044880e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.044880e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.362165 sec - 16,557,574,473 cycles # 3.086 GHz - 44,663,669,622 instructions # 2.70 insn per cycle - 5.433713179 seconds time elapsed +TOTAL : 5.404304 sec + 16,542,848,858 cycles # 3.058 GHz + 44,663,688,353 instructions # 2.70 insn per cycle + 5.410750967 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 574) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe @@ -94,15 +94,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.596189e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.828910e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.828910e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.631536e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869634e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869634e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.023217 sec - 9,105,028,266 cycles # 3.006 GHz - 25,663,298,459 instructions # 2.82 insn per cycle - 3.071092535 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2397) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.995465 sec + 8,977,674,629 cycles # 2.992 GHz + 25,016,671,404 instructions # 2.79 insn per cycle + 3.009020680 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2371) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,15 +120,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.190953e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.682435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.682435e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.387090e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.916862e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.916862e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.128128 sec - 6,201,802,464 cycles # 2.905 GHz - 13,056,963,081 instructions # 2.11 insn per cycle - 2.165397182 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2674) (512y: 0) (512z: 0) +TOTAL : 2.054054 sec + 6,000,307,213 cycles # 2.915 GHz + 12,337,042,112 instructions # 2.06 insn per cycle + 2.066748623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2523) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -146,15 +146,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.472049e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.009929e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.009929e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.658921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.249521e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.249521e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.024045 sec - 5,914,606,163 cycles # 2.914 GHz - 12,603,863,560 instructions # 2.13 insn per cycle - 2.315391858 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2372) (512y: 283) (512z: 0) +TOTAL : 1.961008 sec + 5,732,861,626 cycles # 2.917 GHz + 11,869,224,559 instructions # 2.07 insn per cycle + 1.973908984 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2246) (512y: 242) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -172,15 +172,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.110043e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.413583e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.413583e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.318569e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.652438e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.652438e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.659334 sec - 5,303,753,034 cycles # 1.991 GHz - 8,504,326,913 instructions # 1.60 insn per cycle - 3.010194704 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1264) (512y: 220) (512z: 1954) +TOTAL : 2.536785 sec + 5,170,015,216 cycles # 2.035 GHz + 7,935,725,128 instructions # 1.53 insn per cycle + 2.548762025 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1280) (512y: 203) (512z: 1763) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index f2ae273943..b96e6cdf8c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-20_21:08:28 +DATE: 2023-07-20_18:06:14 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.519546e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.054082e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.071381e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.468816e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.046764e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.063268e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.516281 sec - 2,094,387,742 cycles # 2.897 GHz - 2,574,885,134 instructions # 1.23 insn per cycle - 0.883287678 seconds time elapsed +TOTAL : 0.520889 sec + 2,060,727,068 cycles # 2.850 GHz + 2,558,010,669 instructions # 1.24 insn per cycle + 1.168701441 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.041977e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.319777e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.336612e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.042347e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.319874e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336718e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.643800 sec - 2,618,181,529 cycles # 2.922 GHz - 3,451,375,679 instructions # 1.32 insn per cycle - 0.958969205 seconds time elapsed +TOTAL : 0.639897 sec + 2,616,864,850 cycles # 2.931 GHz + 3,523,429,079 instructions # 1.35 insn per cycle + 0.950772041 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,14 +81,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.650360e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.667066e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.667066e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.626033e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.642947e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.642947e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.219021 sec - 19,275,991,049 cycles # 3.103 GHz - 59,036,956,643 instructions # 3.06 insn per cycle - 6.258053064 seconds time elapsed +TOTAL : 6.273529 sec + 19,268,605,456 cycles # 3.073 GHz + 59,039,157,361 instructions # 3.06 insn per cycle + 6.314726015 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1186) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -107,14 +107,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.979174e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.038728e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.038728e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.949579e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.008121e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.008121e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.314295 sec - 10,290,353,060 cycles # 3.101 GHz - 30,832,154,049 instructions # 3.00 insn per cycle - 3.423374247 seconds time elapsed +TOTAL : 3.335342 sec + 10,301,307,635 cycles # 3.085 GHz + 30,832,568,808 instructions # 2.99 insn per cycle + 3.876744623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5195) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.037602e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.063335e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.063335e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.027358e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.052549e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.052549e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.603208 sec - 4,679,778,133 cycles # 2.910 GHz - 10,978,708,572 instructions # 2.35 insn per cycle - 2.114253887 seconds time elapsed +TOTAL : 1.658042 sec + 4,681,643,115 cycles # 2.883 GHz + 10,978,882,028 instructions # 2.35 insn per cycle + 1.725257325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4229) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -159,14 +159,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.159201e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.191092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.191092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.157144e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188803e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.436516 sec - 4,175,941,051 cycles # 2.899 GHz - 10,174,196,424 instructions # 2.44 insn per cycle - 1.691049996 seconds time elapsed +TOTAL : 1.439170 sec + 4,178,790,730 cycles # 2.895 GHz + 10,174,320,844 instructions # 2.43 insn per cycle + 1.738297291 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4030) (512y: 38) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.983683e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.137884e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.137884e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.037410e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.191866e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.191866e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.096964 sec - 3,991,148,691 cycles # 1.916 GHz - 5,609,836,928 instructions # 1.41 insn per cycle - 2.302677535 seconds time elapsed +TOTAL : 2.064997 sec + 3,989,608,047 cycles # 1.929 GHz + 5,609,692,049 instructions # 1.41 insn per cycle + 2.693307962 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1153) (512y: 83) (512z: 3514) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index da8e18e8a0..a475d7222b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-20_21:53:50 +DATE: 2023-06-16_23:20:42 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.434624e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.163640e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.163640e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.496628e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.248177e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.248177e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.518886 sec - 2,150,200,449 cycles # 2.871 GHz - 2,846,808,229 instructions # 1.32 insn per cycle - 0.805944104 seconds time elapsed +TOTAL : 0.521129 sec + 2,135,210,830 cycles # 2.889 GHz + 2,829,931,118 instructions # 1.33 insn per cycle + 0.798733192 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.355060e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.390860e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.390860e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.419279e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.564570e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.564570e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.884765 sec - 3,377,739,699 cycles # 2.949 GHz - 4,987,103,448 instructions # 1.48 insn per cycle - 1.208280648 seconds time elapsed +TOTAL : 0.870649 sec + 3,346,648,541 cycles # 2.958 GHz + 4,943,935,079 instructions # 1.48 insn per cycle + 1.190299197 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.613486e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.630167e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.630167e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.646125e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.663019e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.663019e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.300272 sec - 19,289,098,585 cycles # 3.061 GHz - 59,042,825,553 instructions # 3.06 insn per cycle - 6.305990250 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1186) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.222512 sec + 19,287,909,225 cycles # 3.099 GHz + 59,047,382,237 instructions # 3.06 insn per cycle + 6.228060526 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1187) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -121,15 +121,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.915877e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.974545e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.974545e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.998676e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.058773e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.058773e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.361962 sec - 10,318,588,423 cycles # 3.067 GHz - 30,878,194,298 instructions # 2.99 insn per cycle - 3.367119059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5195) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.306706 sec + 10,251,708,193 cycles # 3.096 GHz + 30,706,584,426 instructions # 3.00 insn per cycle + 3.312153580 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5158) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.027000e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.052462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052462e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.028551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054465e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054465e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.624671 sec - 4,704,668,189 cycles # 2.888 GHz - 11,025,375,367 instructions # 2.34 insn per cycle - 1.630184567 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4229) (512y: 0) (512z: 0) +TOTAL : 1.624375 sec + 4,708,780,636 cycles # 2.891 GHz + 10,966,076,818 instructions # 2.33 insn per cycle + 1.629709328 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4166) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -175,15 +175,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.151558e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183536e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183536e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.164974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.197651e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.454334 sec - 4,217,743,495 cycles # 2.892 GHz - 10,226,711,519 instructions # 2.42 insn per cycle - 1.469528638 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4030) (512y: 38) (512z: 0) +TOTAL : 1.437942 sec + 4,180,610,973 cycles # 2.898 GHz + 10,160,732,858 instructions # 2.43 insn per cycle + 1.452787766 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3967) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -202,15 +202,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.996826e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.152697e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.152697e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.237498e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.399499e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.399499e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.083094 sec - 4,020,342,215 cycles # 1.928 GHz - 5,649,061,861 instructions # 1.41 insn per cycle - 2.098694845 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1153) (512y: 83) (512z: 3514) +TOTAL : 2.022999 sec + 3,915,207,010 cycles # 1.933 GHz + 5,583,645,604 instructions # 1.43 insn per cycle + 2.033737208 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1125) (512y: 59) (512z: 3431) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index ec5bf2be00..c7fa7a5874 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-20_21:08:59 +DATE: 2023-06-16_22:54:11 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.477153e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.044858e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061356e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.406208e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.034522e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.051529e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.492373 sec - 2,067,241,263 cycles # 2.879 GHz - 2,606,418,823 instructions # 1.26 insn per cycle - 0.880347197 seconds time elapsed +TOTAL : 0.495761 sec + 2,045,430,788 cycles # 2.844 GHz + 2,567,034,130 instructions # 1.26 insn per cycle + 0.778564031 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.034762e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.308343e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325107e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.078800e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305420e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318205e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.638571 sec - 2,605,073,519 cycles # 2.925 GHz - 3,457,864,112 instructions # 1.33 insn per cycle - 0.951841244 seconds time elapsed +TOTAL : 0.628664 sec + 2,557,626,902 cycles # 2.898 GHz + 3,428,444,966 instructions # 1.34 insn per cycle + 0.942725213 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.606805e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.623317e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.623317e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.633830e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.650576e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.308963 sec - 19,297,950,943 cycles # 3.057 GHz - 59,301,138,481 instructions # 3.07 insn per cycle - 6.439664494 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1311) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.246365 sec + 19,302,390,466 cycles # 3.090 GHz + 59,308,261,536 instructions # 3.07 insn per cycle + 6.251260293 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1309) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.994867e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.054861e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.054861e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.048737e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.110388e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.110388e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.304045 sec - 10,200,591,825 cycles # 3.084 GHz - 30,475,881,621 instructions # 2.99 insn per cycle - 3.365130207 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5026) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.268980 sec + 10,121,456,601 cycles # 3.094 GHz + 30,320,135,500 instructions # 3.00 insn per cycle + 3.274001858 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.874801e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.010882e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.010882e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.955459e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.019425e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.019425e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.688537 sec - 4,893,947,963 cycles # 2.899 GHz - 11,483,757,196 instructions # 2.35 insn per cycle - 1.821112683 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4449) (512y: 0) (512z: 0) +TOTAL : 1.669916 sec + 4,868,753,430 cycles # 2.909 GHz + 11,322,372,755 instructions # 2.33 insn per cycle + 1.675375031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4330) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074686e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.102157e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.102157e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.093964e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.122647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.122647e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.564437 sec - 4,504,298,652 cycles # 2.900 GHz - 10,737,737,403 instructions # 2.38 insn per cycle - 1.709835755 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 213) (512z: 0) +TOTAL : 1.521327 sec + 4,446,518,647 cycles # 2.915 GHz + 10,548,968,003 instructions # 2.37 insn per cycle + 1.526307477 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4044) (512y: 186) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.811645e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.959369e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.959369e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.202771e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.370511e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.370511e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.123727 sec - 4,030,815,885 cycles # 1.894 GHz - 5,897,954,529 instructions # 1.46 insn per cycle - 2.173920346 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1134) (512y: 134) (512z: 3617) +TOTAL : 2.023574 sec + 3,918,082,536 cycles # 1.933 GHz + 5,775,352,534 instructions # 1.47 insn per cycle + 2.028589705 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1089) (512y: 110) (512z: 3505) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 1c920cbcbb..421224af27 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-20_21:09:29 +DATE: 2023-06-16_22:54:40 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,30 +43,30 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.382353e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.360508e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.473217e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.327176e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.257598e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.374537e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.536989 sec - 2,041,419,289 cycles # 2.882 GHz - 2,464,881,532 instructions # 1.21 insn per cycle - 1.049891645 seconds time elapsed +TOTAL : 0.473310 sec + 2,027,915,544 cycles # 2.867 GHz + 2,463,775,432 instructions # 1.21 insn per cycle + 0.764839697 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 249 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.038824e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.384248e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.483729e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.391345e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.450691e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.517488e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.528037 sec - 2,198,421,232 cycles # 2.872 GHz - 2,766,761,223 instructions # 1.26 insn per cycle - 0.822729940 seconds time elapsed +TOTAL : 0.522081 sec + 2,217,811,440 cycles # 2.881 GHz + 2,788,557,835 instructions # 1.26 insn per cycle + 0.827734782 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.696458e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.710879e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.710879e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.701153e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.715579e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.715579e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.103354 sec - 18,865,013,191 cycles # 3.091 GHz - 59,499,650,964 instructions # 3.15 insn per cycle - 6.119660662 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 966) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.095019 sec + 18,873,238,787 cycles # 3.098 GHz + 59,483,025,635 instructions # 3.15 insn per cycle + 6.100057991 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.807513e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.965464e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.965464e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.884416e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.046889e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.046889e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.881053 sec - 5,782,993,113 cycles # 3.068 GHz - 16,747,026,154 instructions # 2.90 insn per cycle - 1.958193414 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5903) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.866765 sec + 5,710,913,630 cycles # 3.054 GHz + 16,521,962,319 instructions # 2.89 insn per cycle + 1.876809148 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5863) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.019853e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.100807e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.100807e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.049407e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.136391e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.136391e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.832383 sec - 2,439,090,349 cycles # 2.915 GHz - 5,829,457,385 instructions # 2.39 insn per cycle - 0.903057197 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4470) (512y: 0) (512z: 0) +TOTAL : 0.820791 sec + 2,394,884,327 cycles # 2.906 GHz + 5,781,261,143 instructions # 2.41 insn per cycle + 0.825770188 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.234345e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.333168e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.333168e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.285711e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389224e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.754601 sec - 2,195,211,246 cycles # 2.892 GHz - 5,387,743,778 instructions # 2.45 insn per cycle - 0.810969553 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4228) (512y: 31) (512z: 0) +TOTAL : 0.738019 sec + 2,162,282,356 cycles # 2.914 GHz + 5,351,223,903 instructions # 2.47 insn per cycle + 0.743188108 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4167) (512y: 25) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.630602e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.683020e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.683020e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.684448e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.741867e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.741867e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.030720 sec - 2,013,294,421 cycles # 1.947 GHz - 3,059,211,039 instructions # 1.52 insn per cycle - 1.103631095 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 48) (512z: 3643) +TOTAL : 0.995951 sec + 1,960,707,531 cycles # 1.961 GHz + 3,020,418,486 instructions # 1.54 insn per cycle + 1.001177596 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1416) (512y: 33) (512z: 3549) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 057a75b359..cc4c5da246 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-20_21:54:19 +DATE: 2023-06-16_23:21:10 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,20 +47,20 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.933755e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.140104e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.140104e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.989407e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.193414e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.193414e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.481785 sec - 2,020,675,796 cycles # 2.859 GHz - 2,620,444,997 instructions # 1.30 insn per cycle - 0.763592428 seconds time elapsed +TOTAL : 0.482951 sec + 2,013,685,658 cycles # 2.866 GHz + 2,592,200,009 instructions # 1.29 insn per cycle + 0.760014140 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 249 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= @@ -71,14 +71,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.680674e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.598484e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.598484e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.724425e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.653929e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.653929e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737489e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.665313 sec - 2,657,310,035 cycles # 2.933 GHz - 3,690,103,488 instructions # 1.39 insn per cycle - 0.963957639 seconds time elapsed +TOTAL : 0.663195 sec + 2,648,120,389 cycles # 2.929 GHz + 3,654,507,605 instructions # 1.38 insn per cycle + 0.962099488 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.693417e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.708029e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.708029e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.700458e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.715017e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.715017e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.107922 sec - 18,890,378,561 cycles # 3.091 GHz - 59,503,042,285 instructions # 3.15 insn per cycle - 6.113277148 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 966) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.092117 sec + 18,886,470,035 cycles # 3.099 GHz + 59,487,095,856 instructions # 3.15 insn per cycle + 6.096869282 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -121,15 +121,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.778332e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.936095e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.936095e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.823507e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.985125e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.985125e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.891937 sec - 5,803,966,395 cycles # 3.063 GHz - 16,794,649,273 instructions # 2.89 insn per cycle - 1.902772182 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5903) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.882730 sec + 5,728,750,294 cycles # 3.037 GHz + 16,570,856,852 instructions # 2.89 insn per cycle + 1.893324475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5863) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.998089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.078666e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.078666e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.034411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118355e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118355e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.845348 sec - 2,454,719,062 cycles # 2.893 GHz - 5,865,953,165 instructions # 2.39 insn per cycle - 0.850202176 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4470) (512y: 0) (512z: 0) +TOTAL : 0.830571 sec + 2,411,689,968 cycles # 2.891 GHz + 5,818,447,773 instructions # 2.41 insn per cycle + 0.835666053 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -175,15 +175,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.237373e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.336875e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.336875e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.276703e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.381223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.381223e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.757604 sec - 2,212,054,855 cycles # 2.906 GHz - 5,424,221,011 instructions # 2.45 insn per cycle - 0.762618144 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4228) (512y: 31) (512z: 0) +TOTAL : 0.750194 sec + 2,182,144,900 cycles # 2.906 GHz + 5,388,939,388 instructions # 2.47 insn per cycle + 0.755052239 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4167) (512y: 25) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -202,15 +202,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.633225e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.686814e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.686814e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.697532e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.759499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.759499e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.030791 sec - 2,030,333,689 cycles # 1.963 GHz - 3,099,966,605 instructions # 1.53 insn per cycle - 1.035715973 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 48) (512z: 3643) +TOTAL : 0.992848 sec + 1,971,737,093 cycles # 1.979 GHz + 3,062,300,048 instructions # 1.55 insn per cycle + 0.997634050 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1416) (512y: 33) (512z: 3549) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 2cf8d53c7d..adf6bfc552 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-20_21:09:54 +DATE: 2023-06-16_22:55:04 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.376525e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.343871e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.457151e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271976e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.390547e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.473036 sec - 2,007,699,883 cycles # 2.880 GHz - 2,459,130,138 instructions # 1.22 insn per cycle - 0.898616844 seconds time elapsed +TOTAL : 0.474512 sec + 1,999,862,138 cycles # 2.850 GHz + 2,440,400,706 instructions # 1.22 insn per cycle + 0.759138369 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.070069e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.444462e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.545408e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.328864e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.325114e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.388368e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.525604 sec - 2,203,148,996 cycles # 2.898 GHz - 2,760,518,874 instructions # 1.25 insn per cycle - 0.819879637 seconds time elapsed +TOTAL : 0.521920 sec + 2,232,068,560 cycles # 2.897 GHz + 2,825,671,567 instructions # 1.27 insn per cycle + 0.828183210 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.678576e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.693156e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.693156e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.699454e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.714152e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.714152e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.141699 sec - 18,822,482,037 cycles # 3.064 GHz - 59,247,836,614 instructions # 3.15 insn per cycle - 6.261288628 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1032) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.092288 sec + 18,813,707,143 cycles # 3.087 GHz + 59,245,916,196 instructions # 3.15 insn per cycle + 6.097217914 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1031) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.054834e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.224078e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.224078e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.395747e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.576744e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.576744e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.839270 sec - 5,509,016,555 cycles # 3.004 GHz - 16,490,998,648 instructions # 2.99 insn per cycle - 2.053719623 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5660) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.764298 sec + 5,437,573,561 cycles # 3.075 GHz + 16,318,666,941 instructions # 3.00 insn per cycle + 1.774906001 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5638) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.664534e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.720262e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.720262e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.771528e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.833956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.833956e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.005968 sec - 2,893,060,756 cycles # 2.866 GHz - 6,514,013,063 instructions # 2.25 insn per cycle - 1.242459063 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5291) (512y: 0) (512z: 0) +TOTAL : 0.946284 sec + 2,773,608,102 cycles # 2.919 GHz + 6,345,516,906 instructions # 2.29 insn per cycle + 0.951331609 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5044) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.824406e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.890372e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.890372e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.898207e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.970096e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.970096e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008858e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.919386 sec - 2,671,830,518 cycles # 2.892 GHz - 6,055,708,502 instructions # 2.27 insn per cycle - 0.985746679 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5027) (512y: 24) (512z: 0) +TOTAL : 0.884490 sec + 2,572,468,156 cycles # 2.896 GHz + 5,899,227,322 instructions # 2.29 insn per cycle + 0.889316146 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4834) (512y: 18) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.482915e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.527317e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.527317e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.504005e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.551206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.551206e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.128000 sec - 2,194,111,929 cycles # 1.938 GHz - 3,430,841,073 instructions # 1.56 insn per cycle - 1.302571935 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1818) (512y: 32) (512z: 3943) +TOTAL : 1.114511 sec + 2,105,248,250 cycles # 1.885 GHz + 3,318,671,370 instructions # 1.58 insn per cycle + 1.119425864 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1702) (512y: 31) (512z: 3743) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 78ad3dd5a3..509a151f9e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-20_21:10:20 +DATE: 2023-06-16_22:55:28 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.487976e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049895e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.066504e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.484913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.046842e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.063089e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.487461 sec - 2,052,126,557 cycles # 2.880 GHz - 2,555,699,886 instructions # 1.25 insn per cycle - 1.238381497 seconds time elapsed +TOTAL : 0.491268 sec + 2,111,124,427 cycles # 2.897 GHz + 2,572,779,858 instructions # 1.22 insn per cycle + 0.786655156 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.037409e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312760e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329726e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.087869e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318151e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.331093e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.652928 sec - 2,585,961,293 cycles # 2.828 GHz - 3,428,136,482 instructions # 1.33 insn per cycle - 0.984443304 seconds time elapsed +TOTAL : 0.625996 sec + 2,572,645,106 cycles # 2.943 GHz + 3,495,847,613 instructions # 1.36 insn per cycle + 0.932270031 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.605870e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.622140e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.622140e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.611473e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.627816e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.627816e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.311902 sec - 19,594,132,061 cycles # 3.103 GHz - 60,123,866,538 instructions # 3.07 insn per cycle - 6.327868840 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1221) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.298063 sec + 19,584,640,453 cycles # 3.108 GHz + 60,128,609,602 instructions # 3.07 insn per cycle + 6.303425917 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1222) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.751325e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.805662e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.805662e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.848908e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.905341e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.905341e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.472244 sec - 10,164,881,632 cycles # 2.924 GHz - 30,576,476,559 instructions # 3.01 insn per cycle - 4.082121932 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5330) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.403128 sec + 10,110,840,700 cycles # 2.969 GHz + 30,404,987,403 instructions # 3.01 insn per cycle + 3.408194423 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5293) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.036163e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062520e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062520e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.045472e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.071998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071998e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.611082 sec - 4,639,764,079 cycles # 2.878 GHz - 10,937,195,374 instructions # 2.36 insn per cycle - 1.973908866 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4366) (512y: 0) (512z: 0) +TOTAL : 1.590737 sec + 4,630,470,943 cycles # 2.904 GHz + 10,870,600,906 instructions # 2.35 insn per cycle + 1.595619026 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4303) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.176766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.209556e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.209556e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.186438e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.219581e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219581e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.416392 sec - 4,122,187,271 cycles # 2.901 GHz - 10,126,420,661 instructions # 2.46 insn per cycle - 1.711920249 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4168) (512y: 30) (512z: 0) +TOTAL : 1.404412 sec + 4,081,268,261 cycles # 2.899 GHz + 10,057,889,430 instructions # 2.46 insn per cycle + 1.409450424 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4105) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.858861e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.005020e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.005020e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.045191e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.199654e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.199654e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.110003 sec - 4,095,540,862 cycles # 1.937 GHz - 5,816,620,021 instructions # 1.42 insn per cycle - 2.456940004 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1715) (512y: 105) (512z: 3589) +TOTAL : 2.069671 sec + 3,996,882,474 cycles # 1.933 GHz + 5,753,481,101 instructions # 1.44 insn per cycle + 2.074584748 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 81) (512z: 3506) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index d92847fb01..ccf71ae338 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-07-20_21:10:53 +DATE: 2023-06-16_22:55:56 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.401131e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.033928e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.050226e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.431385e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039088e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054905e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.515171 sec - 2,056,839,909 cycles # 2.869 GHz - 2,576,409,865 instructions # 1.25 insn per cycle - 0.855628342 seconds time elapsed +TOTAL : 0.491602 sec + 2,104,753,968 cycles # 2.908 GHz + 2,609,636,072 instructions # 1.24 insn per cycle + 0.781468608 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.028999e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.299299e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.315701e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.077122e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304284e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.317280e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.641939 sec - 2,582,700,068 cycles # 2.889 GHz - 3,436,847,447 instructions # 1.33 insn per cycle - 0.953160137 seconds time elapsed +TOTAL : 0.629159 sec + 2,584,689,149 cycles # 2.926 GHz + 3,485,232,370 instructions # 1.35 insn per cycle + 0.945332117 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.571032e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.586969e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.586969e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.592801e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.608815e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.608815e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.396622 sec - 19,664,379,705 cycles # 3.073 GHz - 60,331,559,061 instructions # 3.07 insn per cycle - 6.434916087 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1268) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.344781 sec + 19,563,789,540 cycles # 3.082 GHz + 60,327,298,777 instructions # 3.08 insn per cycle + 6.350004821 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1269) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.890716e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.948108e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.948108e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.907260e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.965434e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.965434e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.373977 sec - 10,040,824,901 cycles # 2.973 GHz - 30,239,593,978 instructions # 3.01 insn per cycle - 3.464373247 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5139) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.362361 sec + 9,992,104,885 cycles # 2.970 GHz + 30,065,057,144 instructions # 3.01 insn per cycle + 3.367365912 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5113) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.834933e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.006917e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.006917e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.851728e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.008826e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.008826e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.690312 sec - 4,912,013,496 cycles # 2.899 GHz - 11,447,609,412 instructions # 2.33 insn per cycle - 1.724526183 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4548) (512y: 0) (512z: 0) +TOTAL : 1.687332 sec + 4,858,147,798 cycles # 2.873 GHz + 11,292,265,821 instructions # 2.32 insn per cycle + 1.692863069 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4448) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.076603e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.103795e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.103795e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.079486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107998e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107998e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.545997 sec - 4,484,782,690 cycles # 2.892 GHz - 10,702,375,097 instructions # 2.39 insn per cycle - 1.599401132 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4267) (512y: 206) (512z: 0) +TOTAL : 1.542428 sec + 4,416,137,343 cycles # 2.857 GHz + 10,507,517,652 instructions # 2.38 insn per cycle + 1.556019289 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4154) (512y: 177) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.762522e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.907159e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.907159e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.082610e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.236339e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.236339e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.136868 sec - 4,133,592,990 cycles # 1.930 GHz - 6,063,856,587 instructions # 1.47 insn per cycle - 2.228543893 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 156) (512z: 3670) +TOTAL : 2.052702 sec + 4,019,238,803 cycles # 1.954 GHz + 5,946,674,764 instructions # 1.48 insn per cycle + 2.063384894 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1626) (512y: 130) (512z: 3560) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 8c2c94b018..a50b715c33 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:11:24 +DATE: 2023-07-20_18:06:49 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.466993e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.500632e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.503033e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499883e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.534958e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.537448e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.570415 sec - 2,293,155,939 cycles # 2.889 GHz - 3,219,671,716 instructions # 1.40 insn per cycle - 1.117807895 seconds time elapsed +TOTAL : 0.552571 sec + 2,257,370,545 cycles # 2.850 GHz + 3,137,503,968 instructions # 1.39 insn per cycle + 1.248572500 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.136906e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178530e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180213e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.137393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180705e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.084506 sec - 10,117,923,148 cycles # 3.017 GHz - 22,054,840,982 instructions # 2.18 insn per cycle - 3.411397362 seconds time elapsed +TOTAL : 3.080628 sec + 10,129,897,113 cycles # 3.025 GHz + 22,134,397,165 instructions # 2.19 insn per cycle + 3.408190094 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,14 +81,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.993029e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.994282e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.994282e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.983681e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.984882e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.984882e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.270012 sec - 25,458,466,562 cycles # 3.089 GHz - 78,729,339,695 instructions # 3.09 insn per cycle - 8.306316859 seconds time elapsed +TOTAL : 8.278998 sec + 25,450,157,586 cycles # 3.073 GHz + 78,728,446,191 instructions # 3.09 insn per cycle + 8.608822725 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -107,14 +107,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.684255e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.688337e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.688337e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.677551e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.681895e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.681895e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.466462 sec - 12,933,120,057 cycles # 2.895 GHz - 39,279,906,325 instructions # 3.04 insn per cycle - 4.550851378 seconds time elapsed +TOTAL : 4.470387 sec + 12,940,039,913 cycles # 2.892 GHz + 39,280,217,925 instructions # 3.04 insn per cycle + 4.509840354 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13137) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.605962e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.627284e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.627284e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.546375e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.568102e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.568102e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.918579 sec - 5,586,159,827 cycles # 2.906 GHz - 13,842,042,791 instructions # 2.48 insn per cycle - 3.077526328 seconds time elapsed +TOTAL : 2.010586 sec + 5,578,390,601 cycles # 2.884 GHz + 13,842,207,020 instructions # 2.48 insn per cycle + 2.052872131 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11036) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -159,14 +159,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.657716e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.686504e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.686504e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.716089e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.745050e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.745050e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.834664 sec - 4,920,899,242 cycles # 2.871 GHz - 12,489,660,406 instructions # 2.54 insn per cycle - 1.893952324 seconds time elapsed +TOTAL : 1.698476 sec + 4,920,449,719 cycles # 2.889 GHz + 12,487,679,239 instructions # 2.54 insn per cycle + 1.754922184 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10733) (512y: 35) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.716046e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.733843e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.733843e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.629515e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.646834e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.646834e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.204284 sec - 4,106,709,085 cycles # 1.918 GHz - 6,375,326,773 instructions # 1.55 insn per cycle - 2.274483223 seconds time elapsed +TOTAL : 2.160398 sec + 4,108,673,871 cycles # 1.898 GHz + 6,375,517,693 instructions # 1.55 insn per cycle + 2.320528146 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1418) (512y: 90) (512z:10050) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 0adee0a1f4..f6b028fd57 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:55:16 +DATE: 2023-06-16_23:22:07 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.080439e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.473250e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.473250e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.072253e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.474976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.474976e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.537814 sec - 2,278,470,184 cycles # 2.900 GHz - 3,109,915,299 instructions # 1.36 insn per cycle - 0.845306718 seconds time elapsed +TOTAL : 0.540694 sec + 2,277,802,180 cycles # 2.911 GHz + 3,167,166,199 instructions # 1.39 insn per cycle + 0.842923456 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.565740e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.131447e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.131447e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.563283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.119139e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119139e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.380159 sec - 11,125,978,229 cycles # 3.036 GHz - 24,809,779,899 instructions # 2.23 insn per cycle - 3.732643736 seconds time elapsed +TOTAL : 3.373561 sec + 11,106,851,106 cycles # 3.040 GHz + 22,871,021,639 instructions # 2.06 insn per cycle + 3.711939491 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.998785e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.000076e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.000076e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.000691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.001917e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.001917e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.220308 sec - 25,447,174,300 cycles # 3.096 GHz - 78,731,688,122 instructions # 3.09 insn per cycle - 8.225970073 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.213921 sec + 25,358,245,526 cycles # 3.087 GHz + 78,737,811,439 instructions # 3.11 insn per cycle + 8.219246769 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -121,15 +121,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.685117e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.689224e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.689224e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.672231e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.676605e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.676605e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.466411 sec - 12,956,140,528 cycles # 2.899 GHz - 39,296,620,608 instructions # 3.03 insn per cycle - 4.472076509 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13137) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.487084 sec + 13,003,690,708 cycles # 2.899 GHz + 39,263,649,353 instructions # 3.02 insn per cycle + 4.492556901 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.625714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.648804e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.648804e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.646145e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.670478e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.670478e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.916468 sec - 5,590,852,994 cycles # 2.911 GHz - 13,852,000,546 instructions # 2.48 insn per cycle - 1.922141811 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11036) (512y: 0) (512z: 0) +TOTAL : 1.912940 sec + 5,540,461,476 cycles # 2.890 GHz + 13,840,907,410 instructions # 2.50 insn per cycle + 1.918355435 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -175,15 +175,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.712316e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.743029e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.743029e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.799344e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.831037e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.831037e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.704021 sec - 4,936,107,878 cycles # 2.890 GHz - 12,499,839,209 instructions # 2.53 insn per cycle - 1.709837330 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10733) (512y: 35) (512z: 0) +TOTAL : 1.689800 sec + 4,910,066,879 cycles # 2.898 GHz + 12,486,034,623 instructions # 2.54 insn per cycle + 1.695175100 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -202,15 +202,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.648797e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.668243e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.668243e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.596903e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.616542e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.616542e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.158432 sec - 4,115,261,751 cycles # 1.903 GHz - 6,383,914,529 instructions # 1.55 insn per cycle - 2.163849769 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1418) (512y: 90) (512z:10050) +TOTAL : 2.177343 sec + 4,099,033,272 cycles # 1.880 GHz + 6,376,304,361 instructions # 1.56 insn per cycle + 2.182949238 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 485c59b453..92695858f2 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_22:06:11 +DATE: 2023-06-16_23:33:00 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.502760e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.532378e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534768e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.499222e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.525335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.527608e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.532269 sec - 2,211,969,761 cycles # 2.882 GHz - 3,054,590,152 instructions # 1.38 insn per cycle - 0.828075385 seconds time elapsed +TOTAL : 0.532774 sec + 2,217,572,873 cycles # 2.863 GHz + 3,094,369,354 instructions # 1.40 insn per cycle + 0.834887472 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.127366e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.161557e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.162949e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.135027e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.167071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.168390e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.162297 sec - 10,442,103,236 cycles # 3.050 GHz - 23,082,903,351 instructions # 2.21 insn per cycle - 3.481732929 seconds time elapsed +TOTAL : 3.158999 sec + 10,380,525,451 cycles # 3.035 GHz + 21,782,548,801 instructions # 2.10 insn per cycle + 3.477627040 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.994299e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.995601e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.995601e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.007018e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.008318e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.008318e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.236835 sec - 25,423,465,418 cycles # 3.085 GHz - 78,728,442,107 instructions # 3.10 insn per cycle - 8.241870936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.184199 sec + 25,337,258,294 cycles # 3.096 GHz + 78,731,832,013 instructions # 3.11 insn per cycle + 8.189083155 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.678359e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.682937e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.682937e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.689365e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.693788e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.693788e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.470974 sec - 12,988,490,230 cycles # 2.903 GHz - 39,280,708,038 instructions # 3.02 insn per cycle - 4.475878825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13137) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.457849 sec + 12,962,355,153 cycles # 2.906 GHz + 39,244,481,809 instructions # 3.03 insn per cycle + 4.462633637 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.559408e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.581989e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.581989e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.621598e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.647376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.647376e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.928699 sec - 5,595,414,622 cycles # 2.896 GHz - 13,842,472,635 instructions # 2.47 insn per cycle - 1.934105016 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11036) (512y: 0) (512z: 0) +TOTAL : 1.914532 sec + 5,534,212,685 cycles # 2.886 GHz + 13,825,715,526 instructions # 2.50 insn per cycle + 1.920377680 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.726526e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.757424e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.757424e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.758885e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.789531e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.789531e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.698519 sec - 4,925,895,228 cycles # 2.894 GHz - 12,485,231,969 instructions # 2.53 insn per cycle - 1.703665533 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10733) (512y: 35) (512z: 0) +TOTAL : 1.692773 sec + 4,894,078,000 cycles # 2.886 GHz + 12,469,587,925 instructions # 2.55 insn per cycle + 1.697826655 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.650054e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.669278e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.669278e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.808849e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.828810e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.828810e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.156624 sec - 4,110,868,658 cycles # 1.903 GHz - 6,372,725,132 instructions # 1.55 insn per cycle - 2.161817806 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1418) (512y: 90) (512z:10050) +TOTAL : 2.112703 sec + 4,076,094,887 cycles # 1.926 GHz + 6,358,678,796 instructions # 1.56 insn per cycle + 2.117704881 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index fc68c8abb2..0fd3a41abe 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_22:03:08 +DATE: 2023-06-16_23:29:57 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.492001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.521006e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.523249e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.491741e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517860e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520006e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.532223 sec - 2,199,679,818 cycles # 2.865 GHz - 3,078,639,611 instructions # 1.40 insn per cycle - 0.828478944 seconds time elapsed +TOTAL : 0.531429 sec + 2,232,387,650 cycles # 2.917 GHz + 3,085,215,012 instructions # 1.38 insn per cycle + 0.827733891 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.132204e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.166549e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.167974e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.139492e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.171728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.173076e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.104277 sec - 10,292,067,722 cycles # 3.047 GHz - 23,130,792,971 instructions # 2.25 insn per cycle - 3.433901986 seconds time elapsed +TOTAL : 3.104917 sec + 10,133,121,889 cycles # 3.000 GHz + 23,296,776,590 instructions # 2.30 insn per cycle + 3.434357003 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.989716e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.990926e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.990926e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.009220e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.010486e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.010486e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.254254 sec - 25,458,380,390 cycles # 3.083 GHz - 78,726,548,173 instructions # 3.09 insn per cycle - 8.259716276 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.175111 sec + 25,293,924,967 cycles # 3.094 GHz + 78,729,597,256 instructions # 3.11 insn per cycle + 8.179990802 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.680759e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.685062e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.685062e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.697722e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.701921e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.701921e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.466276 sec - 12,940,500,806 cycles # 2.896 GHz - 39,279,248,367 instructions # 3.04 insn per cycle - 4.471292279 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13137) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.446354 sec + 12,955,336,489 cycles # 2.913 GHz + 39,244,878,043 instructions # 3.03 insn per cycle + 4.451190139 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.594855e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.618427e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.618427e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.711957e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.735812e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.735812e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.919409 sec - 5,571,102,600 cycles # 2.897 GHz - 13,842,912,759 instructions # 2.48 insn per cycle - 1.924361435 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11036) (512y: 0) (512z: 0) +TOTAL : 1.893317 sec + 5,534,953,514 cycles # 2.918 GHz + 13,828,055,257 instructions # 2.50 insn per cycle + 1.898185319 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.710001e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.740045e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.740045e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.783880e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.814289e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.814289e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.700361 sec - 4,921,841,309 cycles # 2.888 GHz - 12,488,785,402 instructions # 2.54 insn per cycle - 1.705334481 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10733) (512y: 35) (512z: 0) +TOTAL : 1.686497 sec + 4,890,139,976 cycles # 2.893 GHz + 12,471,298,231 instructions # 2.55 insn per cycle + 1.691636381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.730881e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.750510e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.750510e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.791142e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.810133e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.810133e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.132664 sec - 4,104,858,621 cycles # 1.922 GHz - 6,376,566,948 instructions # 1.55 insn per cycle - 2.137609218 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1418) (512y: 90) (512z:10050) +TOTAL : 2.116611 sec + 4,073,012,903 cycles # 1.922 GHz + 6,360,730,501 instructions # 1.56 insn per cycle + 2.121591447 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 6f251d3b56..a4f7f78bc7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_22:00:09 +DATE: 2023-06-16_23:26:59 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.136728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.527717e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.530093e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.163660e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517064e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519281e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.537775 sec - 2,208,542,402 cycles # 2.858 GHz - 3,063,548,109 instructions # 1.39 insn per cycle - 0.833202849 seconds time elapsed +TOTAL : 0.535969 sec + 2,257,409,350 cycles # 2.913 GHz + 3,148,870,456 instructions # 1.39 insn per cycle + 0.835585235 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -62,14 +62,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.631459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.149473e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.150870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.637869e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.154701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.156020e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.269698 sec - 10,718,322,872 cycles # 3.035 GHz - 23,523,848,145 instructions # 2.19 insn per cycle - 3.589302594 seconds time elapsed +TOTAL : 3.270916 sec + 10,746,580,301 cycles # 3.041 GHz + 21,720,295,313 instructions # 2.02 insn per cycle + 3.590840027 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -84,15 +84,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.991882e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.993056e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.993056e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.007137e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.008416e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.008416e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.244533 sec - 25,439,061,241 cycles # 3.084 GHz - 78,727,259,014 instructions # 3.09 insn per cycle - 8.249993439 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.182065 sec + 25,337,840,293 cycles # 3.096 GHz + 78,726,782,076 instructions # 3.11 insn per cycle + 8.187223661 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4807) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -110,15 +110,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.691320e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.695813e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.695813e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.676580e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.680802e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.680802e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.453699 sec - 12,938,246,649 cycles # 2.904 GHz - 39,281,420,124 instructions # 3.04 insn per cycle - 4.458774291 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13137) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.471527 sec + 12,958,437,919 cycles # 2.896 GHz + 39,245,010,758 instructions # 3.03 insn per cycle + 4.476404809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13100) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -136,15 +136,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.570423e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.594260e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.594260e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.510079e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.532022e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.532022e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.924843 sec - 5,567,069,229 cycles # 2.888 GHz - 13,843,422,538 instructions # 2.49 insn per cycle - 1.929753617 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11036) (512y: 0) (512z: 0) +TOTAL : 1.938058 sec + 5,528,373,338 cycles # 2.847 GHz + 13,827,020,883 instructions # 2.50 insn per cycle + 1.943269358 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10973) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -162,15 +162,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.731903e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.762230e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.762230e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.746577e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.776421e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.776421e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.695828 sec - 4,914,223,683 cycles # 2.892 GHz - 12,487,131,526 instructions # 2.54 insn per cycle - 1.701100063 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10733) (512y: 35) (512z: 0) +TOTAL : 1.693197 sec + 4,886,719,743 cycles # 2.881 GHz + 12,471,368,394 instructions # 2.55 insn per cycle + 1.698247712 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10670) (512y: 29) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,15 +188,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.679319e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.698620e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.698620e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.653939e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.671885e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.671885e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.146952 sec - 4,103,248,100 cycles # 1.908 GHz - 6,376,541,949 instructions # 1.55 insn per cycle - 2.151889123 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1418) (512y: 90) (512z:10050) +TOTAL : 2.156844 sec + 4,112,325,475 cycles # 1.906 GHz + 6,362,849,586 instructions # 1.55 insn per cycle + 2.161761478 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1390) (512y: 66) (512z: 9967) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 9abf1667b9..f6164c5cd9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:12:03 +DATE: 2023-06-16_22:57:00 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.480278e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.513831e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.516616e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.505261e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.540833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.543305e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.550049 sec - 2,287,526,264 cycles # 2.897 GHz - 3,138,093,773 instructions # 1.37 insn per cycle - 0.919156378 seconds time elapsed +TOTAL : 0.552999 sec + 2,261,605,886 cycles # 2.864 GHz + 3,081,160,015 instructions # 1.36 insn per cycle + 0.848098050 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.135453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.177068e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178746e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144541e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.178169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.179509e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.080317 sec - 10,165,972,215 cycles # 3.035 GHz - 20,552,643,405 instructions # 2.02 insn per cycle - 3.406574604 seconds time elapsed +TOTAL : 3.060795 sec + 10,085,184,977 cycles # 3.025 GHz + 22,493,985,254 instructions # 2.23 insn per cycle + 3.393105734 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,14 +81,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.007223e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.008429e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.008429e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.940758e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.941897e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.941897e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.193853 sec - 25,202,772,526 cycles # 3.079 GHz - 78,472,855,946 instructions # 3.11 insn per cycle - 8.251001899 seconds time elapsed +TOTAL : 8.462419 sec + 25,229,958,045 cycles # 2.982 GHz + 78,471,131,282 instructions # 3.11 insn per cycle + 8.467976850 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4138) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.670079e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.674107e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.674107e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.727132e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.731470e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.731470e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.483975 sec - 12,953,152,194 cycles # 2.890 GHz - 39,220,450,902 instructions # 3.03 insn per cycle - 4.693256291 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12915) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.410452 sec + 12,894,155,265 cycles # 2.921 GHz + 39,184,570,946 instructions # 3.04 insn per cycle + 4.415732132 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12872) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.595958e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.617953e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.617953e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.691843e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.716729e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.716729e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.934158 sec - 5,587,053,203 cycles # 2.905 GHz - 13,963,012,003 instructions # 2.50 insn per cycle - 1.979768787 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11215) (512y: 0) (512z: 0) +TOTAL : 1.898008 sec + 5,540,010,361 cycles # 2.913 GHz + 13,917,256,443 instructions # 2.51 insn per cycle + 1.903063407 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11079) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.674049e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.701552e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.701552e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.729941e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.760433e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.760433e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.709060 sec - 4,950,193,267 cycles # 2.894 GHz - 12,613,767,195 instructions # 2.55 insn per cycle - 1.823511106 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10799) (512y: 209) (512z: 0) +TOTAL : 1.696750 sec + 4,936,264,808 cycles # 2.903 GHz + 12,569,278,770 instructions # 2.55 insn per cycle + 1.701843841 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10689) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.747035e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.765473e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.765473e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.790703e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.810947e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.810947e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.155290 sec - 4,109,795,781 cycles # 1.927 GHz - 6,499,229,387 instructions # 1.58 insn per cycle - 2.397424307 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1337) (512y: 197) (512z:10115) +TOTAL : 2.116619 sec + 4,079,630,480 cycles # 1.924 GHz + 6,456,227,558 instructions # 1.58 insn per cycle + 2.121736786 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1301) (512y: 170) (512z:10055) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 5e5d7bc31a..19fe3b6889 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:44:42 +DATE: 2023-06-16_23:12:02 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.241431e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271489e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.273591e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.228389e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.252132e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.253939e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.557294 sec - 2,292,963,127 cycles # 2.888 GHz - 3,239,169,663 instructions # 1.41 insn per cycle - 0.851628454 seconds time elapsed +TOTAL : 0.557644 sec + 2,313,651,554 cycles # 2.916 GHz + 3,231,773,253 instructions # 1.40 insn per cycle + 0.852166017 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.763382e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.791930e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.793084e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.762297e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.789061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.790098e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.350348 sec - 11,007,504,759 cycles # 3.048 GHz - 24,886,809,994 instructions # 2.26 insn per cycle - 3.670454422 seconds time elapsed +TOTAL : 3.347281 sec + 10,991,558,985 cycles # 3.045 GHz + 24,428,456,620 instructions # 2.22 insn per cycle + 3.667854463 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.446494e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.447098e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.447098e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.470440e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.471090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.471090e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 36.904533 sec - 113,382,852,435 cycles # 3.073 GHz - 145,134,639,869 instructions # 1.28 insn per cycle - 36.909888176 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21732) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 36.698448 sec + 113,650,760,924 cycles # 3.097 GHz + 145,145,803,355 instructions # 1.28 insn per cycle + 36.703372680 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21749) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -97,8 +97,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140439E-004 +Relative difference = 2.8372991823632784e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.362499e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.365955e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.365955e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.344679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348354e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348354e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.888268 sec - 14,677,993,677 cycles # 3.001 GHz - 37,472,914,626 instructions # 2.55 insn per cycle - 4.893680127 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:67987) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.914072 sec + 14,660,206,602 cycles # 2.982 GHz + 37,434,709,642 instructions # 2.55 insn per cycle + 4.919337364 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:67993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.830537e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.849306e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.849306e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.882298e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.901243e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.901243e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.105721 sec - 6,096,759,703 cycles # 2.892 GHz - 12,990,862,793 instructions # 2.13 insn per cycle - 2.110664037 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46628) (512y: 0) (512z: 0) +TOTAL : 2.091187 sec + 6,050,971,713 cycles # 2.889 GHz + 12,923,420,886 instructions # 2.14 insn per cycle + 2.096398672 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46338) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.449842e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.479067e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.479067e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.423590e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.450741e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.450741e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.746964 sec - 5,050,262,369 cycles # 2.885 GHz - 11,389,962,191 instructions # 2.26 insn per cycle - 1.752063111 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40204) (512y: 253) (512z: 0) +TOTAL : 1.751970 sec + 5,013,534,941 cycles # 2.856 GHz + 11,332,565,506 instructions # 2.26 insn per cycle + 1.757089218 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40036) (512y: 188) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.942178e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.962084e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.962084e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.963573e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.983731e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.983731e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.076047 sec - 3,950,091,453 cycles # 1.900 GHz - 5,859,649,851 instructions # 1.48 insn per cycle - 2.081424994 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2068) (512y: 349) (512z:39168) +TOTAL : 2.070157 sec + 3,940,063,125 cycles # 1.900 GHz + 5,797,816,506 instructions # 1.47 insn per cycle + 2.075319073 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1924) (512y: 317) (512z:38936) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 879829cf08..5e839cdef4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:45:50 +DATE: 2023-06-16_23:13:09 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.235844e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.264598e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.266853e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.263284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.287600e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.289479e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.558143 sec - 2,280,770,187 cycles # 2.875 GHz - 3,178,097,484 instructions # 1.39 insn per cycle - 0.851367813 seconds time elapsed +TOTAL : 0.554858 sec + 2,295,580,731 cycles # 2.899 GHz + 3,230,493,146 instructions # 1.41 insn per cycle + 0.849020118 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.791964e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.820467e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.821641e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.794287e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.821112e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.822179e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.308185 sec - 10,862,107,398 cycles # 3.040 GHz - 25,240,172,650 instructions # 2.32 insn per cycle - 3.629398764 seconds time elapsed +TOTAL : 3.303147 sec + 10,830,294,927 cycles # 3.034 GHz + 23,889,615,566 instructions # 2.21 insn per cycle + 3.626334184 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.424384e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.425028e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.425028e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.442294e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.442931e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.442931e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.088440 sec - 114,211,394,089 cycles # 3.080 GHz - 145,714,818,946 instructions # 1.28 insn per cycle - 37.094031678 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:22564) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 36.931797 sec + 114,071,883,560 cycles # 3.089 GHz + 145,758,435,006 instructions # 1.28 insn per cycle + 36.937096359 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:22580) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.312433e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.315949e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.315949e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.286130e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.289440e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.289440e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.962323 sec - 15,003,469,170 cycles # 3.023 GHz - 37,760,342,381 instructions # 2.52 insn per cycle - 4.967441688 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68503) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.001270 sec + 14,944,293,116 cycles # 2.986 GHz + 37,593,481,677 instructions # 2.52 insn per cycle + 5.006579543 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.999242e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.020149e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.020149e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.014357e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.033871e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.033871e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.060967 sec - 5,961,641,103 cycles # 2.888 GHz - 12,864,856,616 instructions # 2.16 insn per cycle - 2.066277526 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45815) (512y: 0) (512z: 0) +TOTAL : 2.057405 sec + 5,959,840,539 cycles # 2.891 GHz + 12,824,533,137 instructions # 2.15 insn per cycle + 2.062870435 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45687) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.529149e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.559685e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.559685e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.500841e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.528151e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.528151e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.732437 sec - 5,048,694,956 cycles # 2.908 GHz - 11,388,914,097 instructions # 2.26 insn per cycle - 1.737591058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39935) (512y: 191) (512z: 0) +TOTAL : 1.743665 sec + 5,035,330,638 cycles # 2.888 GHz + 11,343,590,938 instructions # 2.25 insn per cycle + 1.748819840 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39850) (512y: 138) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.964676e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.984660e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.984660e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.021482e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.042248e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.042248e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.070016 sec - 3,942,624,504 cycles # 1.901 GHz - 5,832,014,355 instructions # 1.48 insn per cycle - 2.075300065 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1661) (512y: 281) (512z:38823) +TOTAL : 2.055496 sec + 3,937,439,973 cycles # 1.912 GHz + 5,773,024,249 instructions # 1.47 insn per cycle + 2.060744461 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1589) (512y: 251) (512z:38642) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index cc9e16824b..0b7bc69d3c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:12:42 +DATE: 2023-06-16_22:57:36 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.293805e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.354264e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.360829e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.285069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.329665e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.335265e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.506137 sec - 2,123,609,565 cycles # 2.901 GHz - 2,768,376,421 instructions # 1.30 insn per cycle - 0.891121799 seconds time elapsed +TOTAL : 0.508351 sec + 2,077,856,827 cycles # 2.833 GHz + 2,725,078,892 instructions # 1.31 insn per cycle + 0.790826956 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.485785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.574453e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.578350e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.539433e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.615009e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.618092e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.762237 sec - 6,007,225,091 cycles # 2.995 GHz - 12,433,256,847 instructions # 2.07 insn per cycle - 2.063232934 seconds time elapsed +TOTAL : 1.761585 sec + 5,789,825,690 cycles # 2.887 GHz + 11,967,255,873 instructions # 2.07 insn per cycle + 2.065101803 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.086460e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.087518e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.087518e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.965658e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.966651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.966651e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.872665 sec - 24,454,726,323 cycles # 3.106 GHz - 78,145,561,925 instructions # 3.20 insn per cycle - 7.894452090 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3559) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.353642 sec + 24,461,782,795 cycles # 2.928 GHz + 78,146,447,581 instructions # 3.19 insn per cycle + 8.359034707 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.574440e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.589416e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.589416e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.592457e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.607446e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.607446e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.174447 sec - 6,325,563,132 cycles # 2.903 GHz - 20,143,419,791 instructions # 3.18 insn per cycle - 2.249810920 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.176903 sec + 6,304,553,002 cycles # 2.897 GHz + 20,090,800,645 instructions # 3.19 insn per cycle + 2.181656558 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.720363e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.727621e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.727621e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.706358e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.713844e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.713844e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.995468 sec - 2,807,396,977 cycles # 2.903 GHz - 7,046,192,972 instructions # 2.51 insn per cycle - 1.103665765 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11331) (512y: 0) (512z: 0) +TOTAL : 0.970457 sec + 2,815,185,653 cycles # 2.890 GHz + 7,033,969,834 instructions # 2.50 insn per cycle + 0.975199568 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.937476e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.946846e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.946846e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.944671e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953973e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953973e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.855567 sec - 2,480,464,482 cycles # 2.884 GHz - 6,340,839,753 instructions # 2.56 insn per cycle - 0.914644333 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10976) (512y: 38) (512z: 0) +TOTAL : 0.852685 sec + 2,474,785,553 cycles # 2.891 GHz + 6,331,079,508 instructions # 2.56 insn per cycle + 0.857424788 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.563099e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.569046e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.569046e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.497348e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.503099e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.503099e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.058574 sec - 2,055,799,002 cycles # 1.935 GHz - 3,256,018,715 instructions # 1.58 insn per cycle - 1.127459406 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1723) (512y: 55) (512z:10179) +TOTAL : 1.105154 sec + 2,042,054,263 cycles # 1.843 GHz + 3,246,636,997 instructions # 1.59 insn per cycle + 1.110173718 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index c772d72fef..f56715ce11 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:55:53 +DATE: 2023-06-16_23:22:43 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.623137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.295470e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.295470e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.645163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.314203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.314203e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.491570 sec - 2,032,092,423 cycles # 2.858 GHz - 2,706,449,576 instructions # 1.33 insn per cycle - 0.768943384 seconds time elapsed +TOTAL : 0.495037 sec + 2,044,288,346 cycles # 2.860 GHz + 2,702,609,067 instructions # 1.32 insn per cycle + 0.772230608 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.186058e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.457399e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.457399e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.239966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.484650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484650e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641709e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.932434 sec - 6,587,419,441 cycles # 3.023 GHz - 13,509,593,797 instructions # 2.05 insn per cycle - 2.236088305 seconds time elapsed +TOTAL : 1.924795 sec + 6,601,631,800 cycles # 3.040 GHz + 13,123,912,596 instructions # 1.99 insn per cycle + 2.228980169 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.067141e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.068220e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.068220e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.043992e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.045065e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.045065e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.946929 sec - 24,463,849,105 cycles # 3.077 GHz - 78,150,331,872 instructions # 3.19 insn per cycle - 7.952390320 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3559) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.036711 sec + 24,503,345,133 cycles # 3.048 GHz + 78,146,893,496 instructions # 3.19 insn per cycle + 8.041734648 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -121,15 +121,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.561934e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.577856e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.577856e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.476480e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.491910e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.491910e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.186281 sec - 6,339,521,314 cycles # 2.901 GHz - 20,153,968,676 instructions # 3.18 insn per cycle - 2.191168530 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.205721 sec + 6,317,208,814 cycles # 2.860 GHz + 20,099,654,048 instructions # 3.18 insn per cycle + 2.210968010 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.707430e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.714936e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.714936e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.696185e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.703554e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.703554e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.972158 sec - 2,818,505,074 cycles # 2.888 GHz - 7,055,591,439 instructions # 2.50 insn per cycle - 0.977387429 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11331) (512y: 0) (512z: 0) +TOTAL : 0.978597 sec + 2,817,888,819 cycles # 2.869 GHz + 7,044,186,426 instructions # 2.50 insn per cycle + 0.983797034 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -175,15 +175,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948667e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.958767e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.958767e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.943265e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953322e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953322e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.853216 sec - 2,487,341,355 cycles # 2.902 GHz - 6,349,964,082 instructions # 2.55 insn per cycle - 0.858063641 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10976) (512y: 38) (512z: 0) +TOTAL : 0.855535 sec + 2,477,342,849 cycles # 2.883 GHz + 6,340,941,160 instructions # 2.56 insn per cycle + 0.860485836 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -202,15 +202,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.540296e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.546612e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.546612e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.553687e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.559969e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.559969e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.076911 sec - 2,064,199,000 cycles # 1.909 GHz - 3,265,783,017 instructions # 1.58 insn per cycle - 1.082355161 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1723) (512y: 55) (512z:10179) +TOTAL : 1.067416 sec + 2,044,224,175 cycles # 1.909 GHz + 3,257,025,834 instructions # 1.59 insn per cycle + 1.072360210 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 1b0aedbb15..f758b117fb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_22:06:48 +DATE: 2023-06-16_23:33:36 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.297523e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.349455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.355012e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.328620e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.382342e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387489e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.489460 sec - 2,031,983,136 cycles # 2.866 GHz - 2,660,315,302 instructions # 1.31 insn per cycle - 0.766707793 seconds time elapsed +TOTAL : 0.490468 sec + 2,053,312,645 cycles # 2.896 GHz + 2,688,522,220 instructions # 1.31 insn per cycle + 0.768623655 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.535927e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.609671e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.612844e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.549643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.618843e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.621847e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.825308 sec - 6,246,908,367 cycles # 3.015 GHz - 13,101,638,870 instructions # 2.10 insn per cycle - 2.129563906 seconds time elapsed +TOTAL : 1.827390 sec + 6,222,758,821 cycles # 3.019 GHz + 12,654,178,829 instructions # 2.03 insn per cycle + 2.131020283 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.055085e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.056196e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.056196e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065245e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.066355e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.066355e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 7.991464 sec - 24,454,538,852 cycles # 3.060 GHz - 78,147,100,596 instructions # 3.20 insn per cycle - 7.996181444 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3559) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.951720 sec + 24,455,995,614 cycles # 3.075 GHz + 78,143,504,623 instructions # 3.20 insn per cycle + 7.956774266 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.555366e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.569744e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.569744e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.603380e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.618762e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.618762e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.187402 sec - 6,332,019,786 cycles # 2.896 GHz - 20,143,927,605 instructions # 3.18 insn per cycle - 2.192547740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.174453 sec + 6,308,707,547 cycles # 2.902 GHz + 20,091,203,739 instructions # 3.18 insn per cycle + 2.179216274 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.693061e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.700655e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.700655e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.709770e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.717220e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.717220e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.979470 sec - 2,816,494,037 cycles # 2.865 GHz - 7,045,042,216 instructions # 2.50 insn per cycle - 0.984502588 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11331) (512y: 0) (512z: 0) +TOTAL : 0.969715 sec + 2,816,258,030 cycles # 2.894 GHz + 7,033,592,275 instructions # 2.50 insn per cycle + 0.974263363 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.940268e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.950149e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.950149e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920768e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.930607e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.930607e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.855887 sec - 2,483,849,645 cycles # 2.889 GHz - 6,338,462,252 instructions # 2.55 insn per cycle - 0.860901875 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10976) (512y: 38) (512z: 0) +TOTAL : 0.864540 sec + 2,472,809,203 cycles # 2.851 GHz + 6,329,435,201 instructions # 2.56 insn per cycle + 0.869035951 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.494187e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.499795e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.499795e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.537958e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.544260e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.544260e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214982e-01 +- 3.255524e-01 ) GeV^-4 -TOTAL : 1.108220 sec - 2,058,736,293 cycles # 1.852 GHz - 3,253,847,950 instructions # 1.58 insn per cycle - 1.113291081 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1723) (512y: 55) (512z:10179) +TOTAL : 1.077016 sec + 2,042,333,122 cycles # 1.891 GHz + 3,245,054,769 instructions # 1.59 insn per cycle + 1.081439983 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 566c62eb94..34a43cea0a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_22:03:44 +DATE: 2023-06-16_23:30:33 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.301966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.354598e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.360275e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.323907e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.378161e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.383334e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.486889 sec - 2,022,665,316 cycles # 2.868 GHz - 2,704,291,304 instructions # 1.34 insn per cycle - 0.762871358 seconds time elapsed +TOTAL : 0.487701 sec + 2,049,188,758 cycles # 2.905 GHz + 2,686,122,587 instructions # 1.31 insn per cycle + 0.764918156 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.534023e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.607834e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.611291e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.548245e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.617363e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.620322e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.780286 sec - 6,046,697,232 cycles # 3.000 GHz - 11,899,923,154 instructions # 1.97 insn per cycle - 2.072594848 seconds time elapsed +TOTAL : 1.774012 sec + 6,089,970,636 cycles # 3.032 GHz + 11,896,059,014 instructions # 1.95 insn per cycle + 2.066787545 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.073048e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.074203e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.074203e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.064063e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.065180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.065180e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.926970 sec - 24,451,668,721 cycles # 3.086 GHz - 78,146,846,349 instructions # 3.20 insn per cycle - 7.932022696 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3559) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.960763 sec + 24,464,649,749 cycles # 3.074 GHz + 78,145,698,591 instructions # 3.19 insn per cycle + 7.965669739 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.475860e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.490402e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.490402e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.640949e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.656696e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.656696e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.203097 sec - 6,329,607,517 cycles # 2.869 GHz - 20,142,844,528 instructions # 3.18 insn per cycle - 2.208391848 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.155286 sec + 6,306,014,277 cycles # 2.921 GHz + 20,090,085,412 instructions # 3.19 insn per cycle + 2.160045070 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.699267e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.706674e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.706674e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.710584e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.718001e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.718001e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.974252 sec - 2,809,370,444 cycles # 2.872 GHz - 7,045,407,709 instructions # 2.51 insn per cycle - 0.979220213 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11331) (512y: 0) (512z: 0) +TOTAL : 0.967745 sec + 2,810,549,219 cycles # 2.896 GHz + 7,034,117,328 instructions # 2.50 insn per cycle + 0.972224772 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.942768e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.952832e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.952832e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.945349e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.955042e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955042e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.853132 sec - 2,478,092,843 cycles # 2.891 GHz - 6,339,887,872 instructions # 2.56 insn per cycle - 0.858130173 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10976) (512y: 38) (512z: 0) +TOTAL : 0.852055 sec + 2,468,399,437 cycles # 2.884 GHz + 6,331,090,642 instructions # 2.56 insn per cycle + 0.856970624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.527903e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.534014e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.534014e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.543083e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.549272e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.549272e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.082826 sec - 2,057,380,382 cycles # 1.894 GHz - 3,255,450,758 instructions # 1.58 insn per cycle - 1.087867277 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1723) (512y: 55) (512z:10179) +TOTAL : 1.072075 sec + 2,041,169,827 cycles # 1.898 GHz + 3,247,254,694 instructions # 1.59 insn per cycle + 1.077160788 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 6f63d207b0..513c5a26b1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_22:00:45 +DATE: 2023-06-16_23:27:35 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -44,14 +44,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.713444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.390922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.396204e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.708420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.347094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.352007e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.491684 sec - 2,020,030,319 cycles # 2.851 GHz - 2,689,896,301 instructions # 1.33 insn per cycle - 0.767605249 seconds time elapsed +TOTAL : 0.490976 sec + 2,048,446,840 cycles # 2.889 GHz + 2,674,620,255 instructions # 1.31 insn per cycle + 0.767937575 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -62,14 +62,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.423044e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.609072e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.612225e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.421166e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.615921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.619172e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641709e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.858693 sec - 6,292,992,231 cycles # 3.003 GHz - 12,556,350,716 instructions # 2.00 insn per cycle - 2.155054713 seconds time elapsed +TOTAL : 1.859235 sec + 6,236,141,578 cycles # 2.977 GHz + 12,723,940,591 instructions # 2.04 insn per cycle + 2.154378517 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -84,15 +84,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.077200e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.078264e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.078264e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.071302e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.072413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.072413e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.904744 sec - 24,467,791,241 cycles # 3.094 GHz - 78,145,764,275 instructions # 3.19 insn per cycle - 7.909909533 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3559) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.929179 sec + 24,461,908,171 cycles # 3.086 GHz + 78,144,542,835 instructions # 3.19 insn per cycle + 7.934172762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -110,15 +110,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.522059e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.537026e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.537026e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.571029e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.586100e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.586100e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.190461 sec - 6,328,587,463 cycles # 2.885 GHz - 20,142,983,986 instructions # 3.18 insn per cycle - 2.195619595 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13795) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.174925 sec + 6,301,926,717 cycles # 2.893 GHz + 20,090,235,453 instructions # 3.19 insn per cycle + 2.179846514 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -136,15 +136,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.660326e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.667477e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.667477e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.701876e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.709090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.709090e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.997218 sec - 2,815,411,232 cycles # 2.814 GHz - 7,045,663,201 instructions # 2.50 insn per cycle - 1.002032906 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11331) (512y: 0) (512z: 0) +TOTAL : 0.972776 sec + 2,805,674,310 cycles # 2.875 GHz + 7,034,136,735 instructions # 2.51 insn per cycle + 0.977241135 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -162,15 +162,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.919602e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.929135e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.929135e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.937974e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.948232e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.948232e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.863326 sec - 2,486,107,627 cycles # 2.868 GHz - 6,340,695,682 instructions # 2.55 insn per cycle - 0.868336247 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10976) (512y: 38) (512z: 0) +TOTAL : 0.855178 sec + 2,469,042,209 cycles # 2.877 GHz + 6,331,047,669 instructions # 2.56 insn per cycle + 0.859915572 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10915) (512y: 32) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,15 +188,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.529029e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.535036e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.535036e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.553740e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.559951e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.559951e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.081671 sec - 2,054,874,126 cycles # 1.893 GHz - 3,255,263,728 instructions # 1.58 insn per cycle - 1.086733556 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1723) (512y: 55) (512z:10179) +TOTAL : 1.064606 sec + 2,037,786,255 cycles # 1.909 GHz + 3,246,591,583 instructions # 1.59 insn per cycle + 1.069170450 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1682) (512y: 40) (512z:10085) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 5d201a938c..24986b526d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:13:13 +DATE: 2023-06-16_22:58:06 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.237992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.297166e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.303201e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.278810e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.338048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.343904e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.505362 sec - 2,121,365,697 cycles # 2.908 GHz - 2,731,260,498 instructions # 1.29 insn per cycle - 1.341157810 seconds time elapsed +TOTAL : 0.508627 sec + 2,112,782,804 cycles # 2.878 GHz + 2,766,936,523 instructions # 1.31 insn per cycle + 0.791513495 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.505857e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.595207e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.598955e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.568420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.630751e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.634459e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.763444 sec - 5,995,680,430 cycles # 2.990 GHz - 12,384,648,771 instructions # 2.07 insn per cycle - 2.064561057 seconds time elapsed +TOTAL : 1.749439 sec + 6,009,356,386 cycles # 3.024 GHz + 12,104,468,348 instructions # 2.01 insn per cycle + 2.046648448 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.082483e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.083552e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.083552e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.095981e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.097105e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.097105e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.897607 sec - 24,348,810,982 cycles # 3.087 GHz - 77,889,626,452 instructions # 3.20 insn per cycle - 7.943554318 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.835166 sec + 24,337,509,935 cycles # 3.105 GHz + 77,896,565,911 instructions # 3.20 insn per cycle + 7.840097785 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.608213e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.624241e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.624241e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.647482e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.663046e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.663046e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 2.164785 sec - 6,282,670,265 cycles # 2.897 GHz - 20,099,869,281 instructions # 3.20 insn per cycle - 2.597267083 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13493) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.153739 sec + 6,249,075,693 cycles # 2.898 GHz + 20,045,520,110 instructions # 3.21 insn per cycle + 2.158442622 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13454) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.653008e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.659734e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.659734e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.690515e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.698040e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.698040e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.001126 sec - 2,891,158,299 cycles # 2.876 GHz - 7,185,506,651 instructions # 2.49 insn per cycle - 1.058195658 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11939) (512y: 0) (512z: 0) +TOTAL : 0.979063 sec + 2,866,637,980 cycles # 2.916 GHz + 7,146,127,186 instructions # 2.49 insn per cycle + 0.984190288 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11820) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -149,8 +149,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627263e-04 -Avg ME (F77/C++) = 6.6272629326369348E-004 -Relative difference = 1.0164537758030486e-08 +Avg ME (F77/C++) = 6.6272629297356445E-004 +Relative difference = 1.0602318832827381e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.852946e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.861845e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.861845e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.866768e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875905e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875905e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.894324 sec - 2,587,043,360 cycles # 2.879 GHz - 6,482,351,629 instructions # 2.51 insn per cycle - 1.699234319 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11614) (512y: 31) (512z: 0) +TOTAL : 0.887507 sec + 2,573,527,962 cycles # 2.887 GHz + 6,441,681,549 instructions # 2.50 insn per cycle + 0.892352069 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11516) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -175,8 +175,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627263e-04 -Avg ME (F77/C++) = 6.6272629326369348E-004 -Relative difference = 1.0164537758030486e-08 +Avg ME (F77/C++) = 6.6272629297356445E-004 +Relative difference = 1.0602318832827381e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.496811e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.502352e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.502352e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.528254e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.534227e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.534227e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.104710 sec - 2,126,238,786 cycles # 1.917 GHz - 3,412,832,290 instructions # 1.61 insn per cycle - 1.148968403 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2380) (512y: 27) (512z:10218) +TOTAL : 1.082245 sec + 2,091,652,812 cycles # 1.925 GHz + 3,367,447,588 instructions # 1.61 insn per cycle + 1.087515058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2221) (512y: 32) (512z:10146) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627264e-04 -Avg ME (F77/C++) = 6.6272638322350391E-004 -Relative difference = 2.5314362155245923e-08 +Avg ME (F77/C++) = 6.6272641155893514E-004 +Relative difference = 1.7441488862010707e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index d16ae8bb12..2114debcba 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:46:58 +DATE: 2023-06-16_23:14:17 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.497960e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.545296e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.550208e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.564508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.602710e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.606907e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.514967 sec - 2,174,852,353 cycles # 2.913 GHz - 2,825,633,830 instructions # 1.30 insn per cycle - 0.806507599 seconds time elapsed +TOTAL : 0.511467 sec + 2,169,852,681 cycles # 2.909 GHz + 2,868,849,541 instructions # 1.32 insn per cycle + 0.803186385 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.706793e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.772354e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.774879e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.693782e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.750237e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.752633e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.890952 sec - 6,410,363,982 cycles # 2.999 GHz - 13,551,497,595 instructions # 2.11 insn per cycle - 2.194621569 seconds time elapsed +TOTAL : 1.885046 sec + 6,430,536,190 cycles # 3.024 GHz + 13,410,448,299 instructions # 2.09 insn per cycle + 2.186495562 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 @@ -81,24 +81,24 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.808396e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.809257e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.809257e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.896574e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.897472e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.897472e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.245560 sec - 87,427,417,519 cycles # 3.095 GHz - 135,763,305,163 instructions # 1.55 insn per cycle - 28.250890709 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15818) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 27.823875 sec + 86,121,024,305 cycles # 3.095 GHz + 136,130,940,684 instructions # 1.58 insn per cycle + 27.829007329 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15932) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275341691775930E-004 -Relative difference = 2.5526476807452827e-08 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275349672524630E-004 +Relative difference = 4.9411338183416744e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.170907e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.184544e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.184544e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.179629e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.192771e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.192771e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.306037 sec - 6,711,853,340 cycles # 2.913 GHz - 19,316,667,501 instructions # 2.88 insn per cycle - 2.311182268 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:69621) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.295614 sec + 6,686,096,543 cycles # 2.912 GHz + 19,271,682,618 instructions # 2.88 insn per cycle + 2.300291912 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:69534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.523529e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.529668e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.529668e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.531625e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.537735e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.537735e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.085843 sec - 3,147,043,130 cycles # 2.888 GHz - 6,736,344,397 instructions # 2.14 insn per cycle - 1.090647492 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48453) (512y: 0) (512z: 0) +TOTAL : 1.080134 sec + 3,114,255,706 cycles # 2.875 GHz + 6,664,280,015 instructions # 2.14 insn per cycle + 1.084789912 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47803) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.852671e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.861573e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.861573e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.841445e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850149e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850149e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.894424 sec - 2,600,790,667 cycles # 2.894 GHz - 5,909,273,046 instructions # 2.27 insn per cycle - 0.899564792 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41990) (512y: 22) (512z: 0) +TOTAL : 0.899888 sec + 2,571,109,745 cycles # 2.845 GHz + 5,850,913,153 instructions # 2.28 insn per cycle + 0.904914307 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41536) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.531407e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.537279e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.537279e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.550620e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.556902e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.556902e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.080343 sec - 2,070,458,421 cycles # 1.909 GHz - 3,446,830,194 instructions # 1.66 insn per cycle - 1.085532910 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4586) (512y: 2) (512z:44723) +TOTAL : 1.067211 sec + 2,037,305,474 cycles # 1.903 GHz + 3,373,327,271 instructions # 1.66 insn per cycle + 1.071975743 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4191) (512y: 5) (512z:44245) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index d6bc389031..8e3ac4399e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:47:49 +DATE: 2023-06-16_23:15:08 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.523512e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.569039e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.573575e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.536252e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.573442e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.577601e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.517236 sec - 2,143,058,614 cycles # 2.862 GHz - 2,882,005,781 instructions # 1.34 insn per cycle - 0.808411873 seconds time elapsed +TOTAL : 0.513649 sec + 2,145,771,085 cycles # 2.879 GHz + 2,862,547,641 instructions # 1.33 insn per cycle + 0.805011702 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.570658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.633852e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.636335e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.581647e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.635703e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.638045e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.900965 sec - 6,437,362,378 cycles # 2.999 GHz - 12,983,322,283 instructions # 2.02 insn per cycle - 2.204250034 seconds time elapsed +TOTAL : 1.893721 sec + 6,500,188,004 cycles # 3.036 GHz + 12,690,177,194 instructions # 1.95 insn per cycle + 2.199182146 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.789584e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.790453e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.790453e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.890367e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.891225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.891225e+02 ) sec^-1 MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.336990 sec - 87,258,521,219 cycles # 3.079 GHz - 136,040,134,822 instructions # 1.56 insn per cycle - 28.341949952 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15924) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 27.851215 sec + 85,856,337,659 cycles # 3.083 GHz + 136,047,957,548 instructions # 1.58 insn per cycle + 27.856280727 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15933) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.001228e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.014594e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.014594e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.106679e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.119068e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.119068e+03 ) sec^-1 MeanMatrixElemValue = ( 4.059963e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.351723 sec - 6,815,378,913 cycles # 2.893 GHz - 19,382,378,975 instructions # 2.84 insn per cycle - 2.356786373 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:69627) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.316749 sec + 6,781,000,176 cycles # 2.924 GHz + 19,316,989,732 instructions # 2.85 insn per cycle + 2.321493873 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:69471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.552515e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.559115e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.559115e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.581478e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.587987e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.587987e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.065481 sec - 3,086,498,221 cycles # 2.886 GHz - 6,664,423,542 instructions # 2.16 insn per cycle - 1.070598506 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47280) (512y: 0) (512z: 0) +TOTAL : 1.045728 sec + 3,042,131,837 cycles # 2.898 GHz + 6,594,840,346 instructions # 2.17 insn per cycle + 1.050898181 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46795) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838852e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.847536e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.847536e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870529e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.879536e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.879536e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.901014 sec - 2,609,156,422 cycles # 2.885 GHz - 5,914,055,748 instructions # 2.27 insn per cycle - 0.905734024 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41456) (512y: 13) (512z: 0) +TOTAL : 0.885640 sec + 2,578,942,088 cycles # 2.899 GHz + 5,854,799,048 instructions # 2.27 insn per cycle + 0.890682953 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41080) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.536588e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.542532e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.542532e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.554834e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.561058e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.561058e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.076479 sec - 2,060,987,199 cycles # 1.909 GHz - 3,432,456,550 instructions # 1.67 insn per cycle - 1.081321956 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 2) (512z:44235) +TOTAL : 1.063798 sec + 2,024,256,431 cycles # 1.896 GHz + 3,083,285,385 instructions # 1.52 insn per cycle + 1.068833353 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3372) (512y: 17) (512z:39424) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 8acd5c32ac..b408e7e538 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:13:44 +DATE: 2023-06-16_22:58:34 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.493476e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.527635e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.530084e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.488561e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.521964e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524427e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.548664 sec - 2,296,615,097 cycles # 2.912 GHz - 3,133,080,453 instructions # 1.36 insn per cycle - 1.109827853 seconds time elapsed +TOTAL : 0.554107 sec + 2,279,290,212 cycles # 2.876 GHz + 3,137,052,799 instructions # 1.38 insn per cycle + 0.851081146 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.137059e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178750e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180436e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.134416e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.168392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.169702e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.103624 sec - 10,197,041,650 cycles # 3.023 GHz - 20,944,038,216 instructions # 2.05 insn per cycle - 3.431441466 seconds time elapsed +TOTAL : 3.065812 sec + 10,122,925,933 cycles # 3.040 GHz + 20,842,538,183 instructions # 2.06 insn per cycle + 3.387502173 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.976605e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.977773e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.977773e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978031e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.979322e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979322e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.311679 sec - 25,618,428,544 cycles # 3.083 GHz - 79,194,835,903 instructions # 3.09 insn per cycle - 8.343206640 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4705) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.302095 sec + 25,693,038,802 cycles # 3.094 GHz + 79,192,524,178 instructions # 3.08 insn per cycle + 8.307297645 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4706) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.723307e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.727516e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.727516e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.739227e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.743408e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.743408e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.416300 sec - 12,762,573,701 cycles # 2.887 GHz - 38,544,035,438 instructions # 3.02 insn per cycle - 4.473092722 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13113) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.404759 sec + 12,783,705,279 cycles # 2.903 GHz + 38,505,609,809 instructions # 3.01 insn per cycle + 4.410187217 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13076) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.669183e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.691470e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.691470e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.779933e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.804166e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.804166e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.902360 sec - 5,505,645,553 cycles # 2.887 GHz - 13,634,272,172 instructions # 2.48 insn per cycle - 2.054764792 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10928) (512y: 0) (512z: 0) +TOTAL : 1.879373 sec + 5,476,712,305 cycles # 2.908 GHz + 13,620,238,606 instructions # 2.49 insn per cycle + 1.884513914 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10865) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.613752e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.643159e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.643159e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.768885e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.800184e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.800184e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.758705 sec - 4,888,946,375 cycles # 2.839 GHz - 12,283,305,306 instructions # 2.51 insn per cycle - 1.935364169 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10646) (512y: 26) (512z: 0) +TOTAL : 1.690734 sec + 4,867,518,430 cycles # 2.873 GHz + 12,266,678,987 instructions # 2.52 insn per cycle + 1.696100975 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10583) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.561725e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.579740e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.579740e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.632753e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.650795e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.650795e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.185537 sec - 4,141,609,668 cycles # 1.895 GHz - 6,379,003,093 instructions # 1.54 insn per cycle - 2.239359646 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1450) (512y: 81) (512z:10027) +TOTAL : 2.159522 sec + 4,131,175,813 cycles # 1.911 GHz + 6,362,042,970 instructions # 1.54 insn per cycle + 2.164878251 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1422) (512y: 57) (512z: 9944) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 64e27e5bed..a0ec009068 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-07-20_21:14:22 +DATE: 2023-06-16_22:59:10 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.521585e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.555965e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.558589e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.495757e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.531890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534521e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.553938 sec - 2,257,469,093 cycles # 2.851 GHz - 3,123,134,980 instructions # 1.38 insn per cycle - 1.249911595 seconds time elapsed +TOTAL : 0.553147 sec + 2,267,397,908 cycles # 2.871 GHz + 3,111,867,480 instructions # 1.37 insn per cycle + 0.849472244 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.137839e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.179467e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.181149e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.149132e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.183375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.184726e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.069502 sec - 10,173,875,442 cycles # 3.045 GHz - 20,993,844,495 instructions # 2.06 insn per cycle - 3.397423056 seconds time elapsed +TOTAL : 3.059498 sec + 10,031,596,373 cycles # 3.007 GHz + 20,916,916,227 instructions # 2.09 insn per cycle + 3.392815949 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.984298e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.985547e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.985547e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.978511e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.979800e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.979800e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.277717 sec - 25,595,849,342 cycles # 3.092 GHz - 79,218,225,879 instructions # 3.09 insn per cycle - 8.309194900 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4380) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.301620 sec + 25,591,925,031 cycles # 3.083 GHz + 79,211,596,551 instructions # 3.10 insn per cycle + 8.306715675 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4378) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.715752e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.719942e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.719942e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.744925e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.749316e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.749316e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.443284 sec - 12,799,192,184 cycles # 2.889 GHz - 38,491,370,753 instructions # 3.01 insn per cycle - 4.898248765 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12885) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.389824 sec + 12,788,106,524 cycles # 2.911 GHz + 38,452,416,346 instructions # 3.01 insn per cycle + 4.395123954 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12869) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.594143e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.616711e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.616711e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.672327e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.695790e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.695790e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.918804 sec - 5,566,481,163 cycles # 2.895 GHz - 13,757,355,092 instructions # 2.47 insn per cycle - 2.287391584 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11116) (512y: 0) (512z: 0) +TOTAL : 1.902528 sec + 5,504,597,953 cycles # 2.888 GHz + 13,712,972,642 instructions # 2.49 insn per cycle + 1.907990624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10970) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.656562e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.684472e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.684472e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.759017e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.790204e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.790204e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.715426 sec - 4,942,699,366 cycles # 2.883 GHz - 12,412,821,928 instructions # 2.51 insn per cycle - 2.080868312 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10722) (512y: 209) (512z: 0) +TOTAL : 1.691699 sec + 4,899,459,456 cycles # 2.890 GHz + 12,367,541,245 instructions # 2.52 insn per cycle + 1.696806589 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10604) (512y: 176) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.551633e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.570185e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.570185e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.611473e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.629905e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.629905e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.189584 sec - 4,156,836,001 cycles # 1.900 GHz - 6,482,745,865 instructions # 1.56 insn per cycle - 2.475368400 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1360) (512y: 194) (512z:10077) +TOTAL : 2.165271 sec + 4,135,051,390 cycles # 1.906 GHz + 6,441,618,375 instructions # 1.56 insn per cycle + 2.170469997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1326) (512y: 167) (512z:10035) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 386de00864..b4357ddd50 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-20_21:16:56 +DATE: 2023-07-20_18:07:29 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.075414e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.075803e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.075897e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.075520e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.075910e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.076013e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.776374 sec - 8,450,187,902 cycles # 3.030 GHz - 18,717,301,384 instructions # 2.22 insn per cycle - 3.289917814 seconds time elapsed +TOTAL : 2.453144 sec + 8,415,470,300 cycles # 3.028 GHz + 18,596,590,595 instructions # 2.21 insn per cycle + 2.869032184 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.248444e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.251093e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.251296e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.213469e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.216074e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.216272e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.022900 sec - 13,299,973,484 cycles # 3.054 GHz - 27,505,266,107 instructions # 2.07 insn per cycle - 4.411404366 seconds time elapsed +TOTAL : 4.030919 sec + 13,254,448,164 cycles # 3.037 GHz + 31,481,022,477 instructions # 2.38 insn per cycle + 4.420384500 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,14 +81,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.418593e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.418933e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.418933e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.420184e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.420488e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.420488e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.421316 sec - 19,281,355,551 cycles # 3.070 GHz - 54,054,618,817 instructions # 2.80 insn per cycle - 6.473505590 seconds time elapsed +TOTAL : 6.283928 sec + 19,211,915,422 cycles # 3.056 GHz + 54,053,697,564 instructions # 2.81 insn per cycle + 6.288648290 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:32341) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -107,14 +107,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.637177e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.637305e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.637305e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.622967e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.623075e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623075e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.250110 sec - 9,970,262,968 cycles # 3.082 GHz - 27,086,632,463 instructions # 2.72 insn per cycle - 3.513146338 seconds time elapsed +TOTAL : 3.259346 sec + 9,975,892,872 cycles # 3.058 GHz + 27,085,562,450 instructions # 2.72 insn per cycle + 3.574286196 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96383) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.528971e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.529493e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.529493e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.489179e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489682e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489682e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.502753 sec - 4,337,080,922 cycles # 2.877 GHz - 9,669,696,230 instructions # 2.23 insn per cycle - 1.750891389 seconds time elapsed +TOTAL : 1.518121 sec + 4,328,860,512 cycles # 2.844 GHz + 9,668,801,864 instructions # 2.23 insn per cycle + 1.537419261 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84061) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -159,14 +159,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.036088e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.036768e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.036768e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.961229e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.961898e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.961898e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.475519 sec - 3,831,373,637 cycles # 2.902 GHz - 8,620,226,631 instructions # 2.25 insn per cycle - 1.519676367 seconds time elapsed +TOTAL : 1.340022 sec + 3,824,515,763 cycles # 2.847 GHz + 8,619,182,124 instructions # 2.25 insn per cycle + 1.604248171 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83759) (512y: 36) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.719802e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.720517e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.720517e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.617654e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.618403e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.618403e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.437252 sec - 2,717,419,055 cycles # 1.899 GHz - 4,339,672,727 instructions # 1.60 insn per cycle - 1.506031729 seconds time elapsed +TOTAL : 1.465230 sec + 2,714,369,128 cycles # 1.849 GHz + 4,339,136,277 instructions # 1.60 insn per cycle + 1.492829343 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 91) (512z:83007) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 00090ba80a..1c0184de14 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-20_21:56:21 +DATE: 2023-06-16_23:23:12 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.066653e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.067834e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.067834e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.061497e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.062582e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.062582e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.433458 sec - 8,289,150,212 cycles # 3.001 GHz - 18,428,452,007 instructions # 2.22 insn per cycle - 2.821164282 seconds time elapsed +TOTAL : 2.433988 sec + 8,336,076,363 cycles # 3.005 GHz + 17,014,813,881 instructions # 2.04 insn per cycle + 2.833188841 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.241947e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.281401e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.281401e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.176810e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.216166e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.216166e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.009143 sec - 13,306,017,007 cycles # 3.067 GHz - 30,061,195,219 instructions # 2.26 insn per cycle - 4.394800009 seconds time elapsed +TOTAL : 4.016312 sec + 13,237,795,425 cycles # 3.038 GHz + 28,853,451,424 instructions # 2.18 insn per cycle + 4.417206461 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.397225e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.397522e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.397522e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.426091e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.426409e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.426409e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.301255 sec - 19,330,325,126 cycles # 3.069 GHz - 54,055,630,211 instructions # 2.80 insn per cycle - 6.306169233 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32341) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.283292 sec + 19,229,856,491 cycles # 3.063 GHz + 54,057,430,365 instructions # 2.81 insn per cycle + 6.287771357 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32342) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -121,15 +121,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.637182e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.637308e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.637308e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.625803e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.625919e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.625919e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.239585 sec - 9,968,370,704 cycles # 3.074 GHz - 27,085,960,623 instructions # 2.72 insn per cycle - 3.244254301 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96383) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.253969 sec + 9,967,840,687 cycles # 3.060 GHz + 27,083,402,734 instructions # 2.72 insn per cycle + 3.258914943 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96346) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.474512e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.475053e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.475053e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.555664e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.556218e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.556218e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.525741 sec - 4,373,743,150 cycles # 2.861 GHz - 9,669,755,462 instructions # 2.21 insn per cycle - 1.530667246 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84061) (512y: 0) (512z: 0) +TOTAL : 1.491530 sec + 4,286,204,468 cycles # 2.869 GHz + 9,667,785,714 instructions # 2.26 insn per cycle + 1.496036399 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83998) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -175,15 +175,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.992020e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.992722e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.992722e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.941069e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.941824e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.941824e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.329714 sec - 3,841,936,897 cycles # 2.882 GHz - 8,619,582,166 instructions # 2.24 insn per cycle - 1.334674864 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83759) (512y: 36) (512z: 0) +TOTAL : 1.348647 sec + 3,839,497,436 cycles # 2.840 GHz + 8,618,118,436 instructions # 2.24 insn per cycle + 1.353595580 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83696) (512y: 30) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -202,15 +202,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.761554e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.762293e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.762293e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.666454e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.667164e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.667164e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.410799 sec - 2,708,976,181 cycles # 1.915 GHz - 4,338,890,620 instructions # 1.60 insn per cycle - 1.415668781 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 91) (512z:83007) +TOTAL : 1.447325 sec + 2,707,628,155 cycles # 1.868 GHz + 4,338,013,213 instructions # 1.60 insn per cycle + 1.451919961 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1874) (512y: 67) (512z:82924) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index d58ac24147..1144007b2b 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-20_21:18:03 +DATE: 2023-06-16_23:02:29 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.063592e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.063984e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.064083e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.065553e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065928e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066074e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.711832 sec - 8,282,903,722 cycles # 3.017 GHz - 17,389,272,592 instructions # 2.10 insn per cycle - 3.201673177 seconds time elapsed +TOTAL : 2.452549 sec + 8,383,872,432 cycles # 3.005 GHz + 18,091,785,273 instructions # 2.16 insn per cycle + 2.848067694 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.262871e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.265261e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.265471e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.225084e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.227042e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.227224e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.021668 sec - 13,273,055,273 cycles # 3.049 GHz - 31,171,061,319 instructions # 2.35 insn per cycle - 4.411597476 seconds time elapsed +TOTAL : 4.032176 sec + 13,272,438,956 cycles # 3.042 GHz + 28,712,791,562 instructions # 2.16 insn per cycle + 4.421079188 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.406838e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.407149e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.407149e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.506737e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.507047e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.507047e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.324034 sec - 19,328,514,497 cycles # 3.068 GHz - 54,078,525,576 instructions # 2.80 insn per cycle - 6.345660721 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32247) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.213859 sec + 19,274,588,808 cycles # 3.101 GHz + 54,076,478,702 instructions # 2.81 insn per cycle + 6.218544506 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32261) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.623788e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.623901e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.623901e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.638242e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.638356e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.638356e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.284106 sec - 10,019,161,507 cycles # 3.072 GHz - 27,081,365,133 instructions # 2.70 insn per cycle - 3.352749734 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96275) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.237531 sec + 9,921,784,579 cycles # 3.067 GHz + 27,077,809,738 instructions # 2.73 insn per cycle + 3.242137028 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96273) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.515809e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.516476e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.516476e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.488911e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489451e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489451e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.559133 sec - 4,344,155,234 cycles # 2.871 GHz - 9,682,282,495 instructions # 2.23 insn per cycle - 1.707269281 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84290) (512y: 0) (512z: 0) +TOTAL : 1.519935 sec + 4,376,850,073 cycles # 2.874 GHz + 9,677,370,071 instructions # 2.21 insn per cycle + 1.524967874 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84092) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.960936e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.961610e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.961610e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.924415e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.925092e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.925092e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.396149 sec - 3,860,666,833 cycles # 2.877 GHz - 8,630,675,976 instructions # 2.24 insn per cycle - 1.761693919 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83800) (512y: 208) (512z: 0) +TOTAL : 1.350849 sec + 3,841,551,477 cycles # 2.839 GHz + 8,626,542,730 instructions # 2.25 insn per cycle + 1.355459933 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83791) (512y: 180) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.658513e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.659270e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.659270e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.671815e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.672525e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.672525e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.464706 sec - 2,712,269,993 cycles # 1.859 GHz - 4,349,187,858 instructions # 1.60 insn per cycle - 1.524887984 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1879) (512y: 186) (512z:83139) +TOTAL : 1.450398 sec + 2,709,666,226 cycles # 1.865 GHz + 4,344,182,027 instructions # 1.60 insn per cycle + 1.455504193 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1787) (512y: 166) (512z:83071) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index ef8a51d42b..4ef1b474b5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-20_21:19:10 +DATE: 2023-06-16_23:03:32 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.809480e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.810322e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.810578e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.805716e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.806625e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.806872e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.973685 sec - 5,906,535,565 cycles # 3.008 GHz - 12,133,604,991 instructions # 2.05 insn per cycle - 2.498552743 seconds time elapsed +TOTAL : 1.680704 sec + 5,915,505,957 cycles # 3.011 GHz + 12,343,112,691 instructions # 2.09 insn per cycle + 2.022067086 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.307028e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.307806e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.307891e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.312297e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.312960e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.313034e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.936729 sec - 6,683,815,963 cycles # 2.999 GHz - 13,927,578,496 instructions # 2.08 insn per cycle - 2.286392107 seconds time elapsed +TOTAL : 1.956042 sec + 6,779,291,735 cycles # 3.014 GHz + 14,381,222,230 instructions # 2.12 insn per cycle + 2.309303925 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.815946e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.816214e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.816214e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.908521e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.908794e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.908794e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.993608 sec - 18,385,044,752 cycles # 3.065 GHz - 53,645,684,340 instructions # 2.92 insn per cycle - 6.063406534 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20325) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.935380 sec + 18,280,225,429 cycles # 3.079 GHz + 53,644,572,574 instructions # 2.93 insn per cycle + 5.940879711 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20329) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.607082e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.607605e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.607605e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.628533e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.628996e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.628996e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.498986 sec - 4,529,019,663 cycles # 3.072 GHz - 13,769,311,367 instructions # 3.04 insn per cycle - 2.165572613 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96967) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.461345 sec + 4,500,201,036 cycles # 3.073 GHz + 13,763,590,198 instructions # 3.06 insn per cycle + 1.466106403 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96927) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.033501e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.035223e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.035223e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.026530e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.028270e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.028270e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 1.026465 sec - 2,197,575,388 cycles # 2.885 GHz - 4,873,232,721 instructions # 2.22 insn per cycle - 1.158741236 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84349) (512y: 0) (512z: 0) +TOTAL : 0.764097 sec + 2,191,663,686 cycles # 2.869 GHz + 4,871,112,775 instructions # 2.22 insn per cycle + 0.768775887 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84275) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.917974e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.920193e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.920193e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.934823e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.937028e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.937028e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.678689 sec - 1,938,127,607 cycles # 2.848 GHz - 4,345,884,991 instructions # 2.24 insn per cycle - 0.797476333 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84005) (512y: 39) (512z: 0) +TOTAL : 0.671145 sec + 1,934,376,171 cycles # 2.867 GHz + 4,342,607,330 instructions # 2.24 insn per cycle + 0.675706420 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83944) (512y: 33) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.302777e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.305068e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.305068e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.285767e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.288040e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.288040e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6 -TOTAL : 0.756939 sec - 1,368,054,927 cycles # 1.862 GHz - 2,196,065,001 instructions # 1.61 insn per cycle - 0.880346563 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2211) (512y: 56) (512z:83138) +TOTAL : 0.730681 sec + 1,366,589,345 cycles # 1.861 GHz + 2,193,139,079 instructions # 1.60 insn per cycle + 0.735224545 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 41) (512z:83044) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index c6d548c3d9..6f1772cda6 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-20_21:57:24 +DATE: 2023-06-16_23:24:16 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.657116e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.658992e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.658992e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.655466e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.657233e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.657233e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.660024 sec - 5,852,474,741 cycles # 3.017 GHz - 11,624,674,496 instructions # 1.99 insn per cycle - 2.000214747 seconds time elapsed +TOTAL : 1.660443 sec + 5,832,575,389 cycles # 3.001 GHz + 12,080,610,113 instructions # 2.07 insn per cycle + 2.000524939 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.331531e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.344755e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.344755e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.306235e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319271e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319271e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.932203 sec - 6,697,497,826 cycles # 3.025 GHz - 13,576,248,505 instructions # 2.03 insn per cycle - 2.271890988 seconds time elapsed +TOTAL : 1.939786 sec + 6,710,577,294 cycles # 3.010 GHz + 14,122,488,130 instructions # 2.10 insn per cycle + 2.286693438 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.850646e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.850918e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.850918e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.841510e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.841775e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.841775e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.972319 sec - 18,409,448,073 cycles # 3.082 GHz - 53,646,422,116 instructions # 2.91 insn per cycle - 5.976948642 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20325) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.977425 sec + 18,376,830,240 cycles # 3.073 GHz + 53,645,367,177 instructions # 2.92 insn per cycle + 5.982025088 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20329) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -121,15 +121,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.615749e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.616202e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.616202e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.623748e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624254e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624254e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.466788 sec - 4,522,761,853 cycles # 3.077 GHz - 13,768,581,639 instructions # 3.04 insn per cycle - 1.471851311 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96967) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.463452 sec + 4,497,433,282 cycles # 3.068 GHz + 13,764,580,046 instructions # 3.06 insn per cycle + 1.468085626 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96927) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.008075e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.009853e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.009853e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.987628e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.989331e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.989331e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.759899 sec - 2,204,570,668 cycles # 2.888 GHz - 4,872,443,522 instructions # 2.21 insn per cycle - 0.764398259 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84349) (512y: 0) (512z: 0) +TOTAL : 0.762831 sec + 2,199,477,118 cycles # 2.869 GHz + 4,871,433,971 instructions # 2.21 insn per cycle + 0.767671601 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84275) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -175,15 +175,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.690755e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.693120e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.693120e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.924776e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.926917e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.926917e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.693606 sec - 1,939,579,933 cycles # 2.782 GHz - 4,344,789,807 instructions # 2.24 insn per cycle - 0.698719969 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84005) (512y: 39) (512z: 0) +TOTAL : 0.672826 sec + 1,945,457,623 cycles # 2.877 GHz + 4,343,708,064 instructions # 2.23 insn per cycle + 0.677656533 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83944) (512y: 33) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -202,15 +202,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.290159e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.292456e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.292456e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.377484e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.379728e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.379728e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6 -TOTAL : 0.731273 sec - 1,367,323,262 cycles # 1.861 GHz - 2,195,160,472 instructions # 1.61 insn per cycle - 0.736171332 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2211) (512y: 56) (512z:83138) +TOTAL : 0.723682 sec + 1,379,747,419 cycles # 1.897 GHz + 2,194,225,604 instructions # 1.59 insn per cycle + 0.728534863 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2170) (512y: 41) (512z:83044) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 69a28cc4ae..af71691b16 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-20_21:20:01 +DATE: 2023-06-16_23:04:19 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.676233e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.677215e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.677462e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.674461e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.675325e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.675581e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.863382 sec - 5,922,076,212 cycles # 2.981 GHz - 12,211,939,133 instructions # 2.06 insn per cycle - 2.842498391 seconds time elapsed +TOTAL : 1.698247 sec + 5,886,551,862 cycles # 2.951 GHz + 11,210,609,464 instructions # 1.90 insn per cycle + 2.052173907 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.307574e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.308347e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.308431e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.286680e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.287310e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.287386e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.952788 sec - 6,786,217,005 cycles # 3.016 GHz - 14,312,897,389 instructions # 2.11 insn per cycle - 2.307674288 seconds time elapsed +TOTAL : 1.961982 sec + 6,772,975,134 cycles # 3.012 GHz + 13,750,292,328 instructions # 2.03 insn per cycle + 2.305639635 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.956850e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.957127e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.957127e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.000555e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.000832e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.000832e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.942260 sec - 18,253,531,913 cycles # 3.091 GHz - 53,663,756,742 instructions # 2.94 insn per cycle - 5.974908054 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20482) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.871994 sec + 18,105,311,919 cycles # 3.082 GHz + 53,664,310,633 instructions # 2.96 insn per cycle + 5.876674372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.638541e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.639067e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.639067e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.589372e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.589815e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.589815e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.467595 sec - 4,500,074,415 cycles # 3.079 GHz - 13,761,672,201 instructions # 3.06 insn per cycle - 1.720757201 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96701) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.476565 sec + 4,561,855,617 cycles # 3.083 GHz + 13,756,278,948 instructions # 3.02 insn per cycle + 1.481667004 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.992845e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.994631e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.994631e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.993067e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.994817e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.994817e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.788600 sec - 2,202,642,022 cycles # 2.880 GHz - 4,881,104,849 instructions # 2.22 insn per cycle - 0.833308615 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84889) (512y: 0) (512z: 0) +TOTAL : 0.761742 sec + 2,188,633,514 cycles # 2.862 GHz + 4,877,048,177 instructions # 2.23 insn per cycle + 0.766373792 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.882090e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.884267e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.884267e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.971095e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.973270e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.973270e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826769e-06 ) GeV^-6 -TOTAL : 0.675184 sec - 1,960,536,879 cycles # 2.887 GHz - 4,352,229,515 instructions # 2.22 insn per cycle - 0.724414266 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84720) (512y: 27) (512z: 0) +TOTAL : 0.668601 sec + 1,935,360,358 cycles # 2.879 GHz + 4,348,847,241 instructions # 2.25 insn per cycle + 0.673510965 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84638) (512y: 22) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.371724e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.374139e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.374139e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.103833e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.106308e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.106308e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826768e-06 ) GeV^-6 -TOTAL : 1.073490 sec - 1,373,484,416 cycles # 1.890 GHz - 2,204,706,161 instructions # 1.61 insn per cycle - 1.130165802 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2992) (512y: 36) (512z:83278) +TOTAL : 0.749331 sec + 1,373,934,827 cycles # 1.825 GHz + 2,200,572,670 instructions # 1.60 insn per cycle + 0.754227181 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2740) (512y: 23) (512z:83143) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 3b7a5f7e32..3a9dd35695 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-20_21:20:53 +DATE: 2023-06-16_23:05:06 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.695072e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695587e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695722e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.693625e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.694288e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.694409e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.198415 sec - 7,595,723,789 cycles # 3.005 GHz - 15,434,651,054 instructions # 2.03 insn per cycle - 2.585405134 seconds time elapsed +TOTAL : 2.196059 sec + 7,620,057,050 cycles # 3.016 GHz + 15,936,153,152 instructions # 2.09 insn per cycle + 2.583018062 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.110308e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.110626e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.110657e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108976e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.109249e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109272e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.430566 sec - 11,369,052,740 cycles # 3.021 GHz - 24,300,231,616 instructions # 2.14 insn per cycle - 3.819891625 seconds time elapsed +TOTAL : 3.430946 sec + 11,408,477,495 cycles # 3.034 GHz + 26,313,998,938 instructions # 2.31 insn per cycle + 3.820144649 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.307650e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.307953e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.307953e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.362446e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.362744e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.362744e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.363010 sec - 19,457,920,688 cycles # 3.058 GHz - 54,293,208,084 instructions # 2.79 insn per cycle - 6.368095274 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:31976) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.325318 sec + 19,430,907,237 cycles # 3.071 GHz + 54,292,603,982 instructions # 2.79 insn per cycle + 6.329867372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:31977) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.636763e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.636884e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.636884e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.619143e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.619255e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.619255e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.242091 sec - 9,492,901,133 cycles # 2.925 GHz - 26,117,896,000 instructions # 2.75 insn per cycle - 3.247292109 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:95956) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.268613 sec + 9,472,675,959 cycles # 2.895 GHz + 26,115,271,176 instructions # 2.76 insn per cycle + 3.273561039 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:95919) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.654193e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.654827e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.654827e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.790808e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.791484e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.791484e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.451001 sec - 4,181,322,087 cycles # 2.876 GHz - 9,332,744,933 instructions # 2.23 insn per cycle - 1.455636907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83829) (512y: 0) (512z: 0) +TOTAL : 1.400732 sec + 4,075,771,818 cycles # 2.902 GHz + 9,331,333,269 instructions # 2.29 insn per cycle + 1.406221756 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83766) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.200270e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.201041e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.201041e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.155112e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.155865e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.155865e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.263149 sec - 3,638,579,348 cycles # 2.873 GHz - 8,308,899,095 instructions # 2.28 insn per cycle - 1.267932232 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83565) (512y: 26) (512z: 0) +TOTAL : 1.276847 sec + 3,682,881,798 cycles # 2.877 GHz + 8,307,430,129 instructions # 2.26 insn per cycle + 1.281346988 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83502) (512y: 20) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.832765e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.833558e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.833558e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.785211e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.786021e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.786021e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.384479 sec - 2,644,551,961 cycles # 1.905 GHz - 4,234,709,426 instructions # 1.60 insn per cycle - 1.389593845 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1939) (512y: 81) (512z:82720) +TOTAL : 1.403459 sec + 2,639,508,347 cycles # 1.878 GHz + 4,234,654,161 instructions # 1.60 insn per cycle + 1.408336015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1911) (512y: 57) (512z:82637) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 71a21a77b1..cd996fa793 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-07-20_21:21:53 +DATE: 2023-06-16_23:06:07 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.681907e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.682458e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.682592e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.688054e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.688638e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.688761e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.199746 sec - 7,609,717,480 cycles # 3.011 GHz - 17,083,094,068 instructions # 2.24 insn per cycle - 2.639012525 seconds time elapsed +TOTAL : 2.196300 sec + 7,637,726,731 cycles # 3.023 GHz + 15,838,474,776 instructions # 2.07 insn per cycle + 2.583156463 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.110698e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111034e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111062e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.108172e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108543e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108568e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.434864 sec - 11,402,455,300 cycles # 3.023 GHz - 26,904,883,245 instructions # 2.36 insn per cycle - 3.831892543 seconds time elapsed +TOTAL : 3.433735 sec + 11,452,674,028 cycles # 3.032 GHz + 23,961,349,118 instructions # 2.09 insn per cycle + 3.834644270 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.404569e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.404871e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.404871e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.781184e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.781448e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.781448e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.287903 sec - 19,376,496,452 cycles # 3.081 GHz - 54,297,162,234 instructions # 2.80 insn per cycle - 6.292874360 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32419) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.780445 sec + 19,564,693,523 cycles # 2.884 GHz + 54,298,011,506 instructions # 2.78 insn per cycle + 6.785104515 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32420) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.625790e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.625906e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.625906e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.707967e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.708090e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.708090e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.254806 sec - 9,507,190,981 cycles # 2.918 GHz - 26,032,026,508 instructions # 2.74 insn per cycle - 3.266272928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:95843) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.103057 sec + 9,582,705,373 cycles # 3.085 GHz + 26,028,889,965 instructions # 2.72 insn per cycle + 3.107702792 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:95694) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.709752e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.710344e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.710344e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.703281e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.703858e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.703858e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.429277 sec - 4,147,525,107 cycles # 2.894 GHz - 9,313,363,723 instructions # 2.25 insn per cycle - 1.434420166 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83724) (512y: 0) (512z: 0) +TOTAL : 1.431633 sec + 4,120,363,711 cycles # 2.872 GHz + 9,310,908,558 instructions # 2.26 insn per cycle + 1.437026482 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83565) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.212792e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.213581e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.213581e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.240493e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.241279e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.241279e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.259433 sec - 3,624,392,166 cycles # 2.872 GHz - 8,305,846,892 instructions # 2.29 insn per cycle - 1.264021539 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83253) (512y: 198) (512z: 0) +TOTAL : 1.251163 sec + 3,616,491,450 cycles # 2.883 GHz + 8,302,817,812 instructions # 2.30 insn per cycle + 1.256135099 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83185) (512y: 170) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.755162e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.755981e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.755981e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.753912e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.754634e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.754634e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.413162 sec - 2,643,285,165 cycles # 1.865 GHz - 4,234,087,849 instructions # 1.60 insn per cycle - 1.418311507 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1469) (512y: 176) (512z:82807) +TOTAL : 1.412974 sec + 2,639,045,649 cycles # 1.864 GHz + 4,230,988,059 instructions # 1.60 insn per cycle + 1.417985866 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1428) (512y: 156) (512z:82779) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index a8723f1746..ac1881c14d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-20_21:15:03 +DATE: 2023-06-16_22:59:46 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.475993e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.447057e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.962495e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.470677e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.403049e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.845489e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.490870 sec - 1,979,741,973 cycles # 2.832 GHz - 2,429,702,136 instructions # 1.23 insn per cycle - 0.871955192 seconds time elapsed +TOTAL : 0.475414 sec + 2,007,563,669 cycles # 2.838 GHz + 2,452,577,465 instructions # 1.22 insn per cycle + 0.764443076 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.984004e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.506003e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.086318e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.377326e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.547761e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.992085e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.564934 sec - 2,377,875,653 cycles # 2.903 GHz - 2,942,981,994 instructions # 1.24 insn per cycle - 0.876669746 seconds time elapsed +TOTAL : 0.557136 sec + 2,348,204,579 cycles # 2.922 GHz + 2,941,788,306 instructions # 1.25 insn per cycle + 0.862619515 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.176571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.209757e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.209757e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.175937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209277e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.209277e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.415464 sec - 4,410,193,194 cycles # 3.106 GHz - 12,854,177,072 instructions # 2.91 insn per cycle - 1.440655470 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 732) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.417777 sec + 4,418,078,401 cycles # 3.108 GHz + 12,858,365,147 instructions # 2.91 insn per cycle + 1.423319552 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 733) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.109253e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.220024e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.220024e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.113749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.225511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225511e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.799194 sec - 2,473,486,754 cycles # 3.078 GHz - 7,162,552,492 instructions # 2.90 insn per cycle - 0.882082300 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3139) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.797724 sec + 2,444,856,977 cycles # 3.051 GHz + 7,068,441,253 instructions # 2.89 insn per cycle + 0.808817770 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.736766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.112272e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.112272e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.944151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.349910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.349910e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.464718 sec - 1,289,576,049 cycles # 2.750 GHz - 2,873,597,932 instructions # 2.23 insn per cycle - 0.552140330 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 0) (512z: 0) +TOTAL : 0.438420 sec + 1,274,694,276 cycles # 2.885 GHz + 2,842,409,497 instructions # 2.23 insn per cycle + 0.443229097 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2725) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.238627e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695780e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695780e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.355894e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.841805e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.841805e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.410605 sec - 1,191,207,791 cycles # 2.867 GHz - 2,738,402,015 instructions # 2.30 insn per cycle - 0.462928839 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2582) (512y: 63) (512z: 0) +TOTAL : 0.399144 sec + 1,170,164,950 cycles # 2.906 GHz + 2,702,440,747 instructions # 2.31 insn per cycle + 0.403916574 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2530) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.948924e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.168572e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.168572e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.020461e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.251605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.251605e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.580947 sec - 1,157,982,735 cycles # 1.979 GHz - 1,716,553,704 instructions # 1.48 insn per cycle - 0.630989257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 101) (512z: 2206) +TOTAL : 0.567519 sec + 1,146,857,475 cycles # 2.007 GHz + 1,675,706,744 instructions # 1.46 insn per cycle + 0.572452940 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1055) (512y: 78) (512z: 2135) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index be2ffa1be2..9d2980d703 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-20_21:54:43 +DATE: 2023-06-16_23:21:34 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.016242e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.169657e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.169657e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.070393e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.287586e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.287586e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.500841 sec - 2,114,404,612 cycles # 2.933 GHz - 2,728,346,386 instructions # 1.29 insn per cycle - 0.778308754 seconds time elapsed +TOTAL : 0.504575 sec + 2,103,581,583 cycles # 2.905 GHz + 2,735,769,174 instructions # 1.30 insn per cycle + 0.783099451 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.770810e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.283901e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.283901e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.869815e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.409855e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.409855e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.811386 sec - 3,128,447,694 cycles # 2.917 GHz - 4,396,805,322 instructions # 1.41 insn per cycle - 1.130790211 seconds time elapsed +TOTAL : 0.796207 sec + 3,159,417,542 cycles # 2.977 GHz + 4,392,482,036 instructions # 1.39 insn per cycle + 1.122081608 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,15 +94,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.162945e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.195922e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.166207e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.199063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.199063e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.437636 sec - 4,440,949,887 cycles # 3.080 GHz - 12,861,592,934 instructions # 2.90 insn per cycle - 1.443053914 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 732) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.434034 sec + 4,450,678,360 cycles # 3.095 GHz + 12,865,964,603 instructions # 2.89 insn per cycle + 1.439725535 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 733) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -121,15 +121,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.103515e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.214081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.214081e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.121625e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.232697e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.232697e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.807386 sec - 2,506,001,847 cycles # 3.085 GHz - 7,210,030,075 instructions # 2.88 insn per cycle - 0.818718442 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3139) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.801253 sec + 2,477,560,136 cycles # 3.079 GHz + 7,119,440,241 instructions # 2.87 insn per cycle + 0.812534403 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3093) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.925163e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.328295e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.328295e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.934946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.330898e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.330898e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.448334 sec - 1,317,255,871 cycles # 2.908 GHz - 2,923,967,899 instructions # 2.22 insn per cycle - 0.453993650 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 0) (512z: 0) +TOTAL : 0.446709 sec + 1,309,681,066 cycles # 2.901 GHz + 2,895,588,054 instructions # 2.21 insn per cycle + 0.458145800 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2725) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -175,15 +175,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.150300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.612005e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.612005e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.311032e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.799335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.799335e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.426633 sec - 1,225,350,532 cycles # 2.840 GHz - 2,788,857,146 instructions # 2.28 insn per cycle - 0.432491653 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2582) (512y: 63) (512z: 0) +TOTAL : 0.409974 sec + 1,200,991,662 cycles # 2.902 GHz + 2,753,820,763 instructions # 2.29 insn per cycle + 0.420684326 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2530) (512y: 54) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -202,15 +202,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.973972e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.199392e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.199392e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.052439e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.289835e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.289835e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.583569 sec - 1,192,886,058 cycles # 2.029 GHz - 1,757,745,493 instructions # 1.47 insn per cycle - 0.589190247 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1093) (512y: 101) (512z: 2206) +TOTAL : 0.568236 sec + 1,177,198,332 cycles # 2.056 GHz + 1,715,837,168 instructions # 1.46 insn per cycle + 0.573431193 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1055) (512y: 78) (512z: 2135) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index f6ed20d46a..0ac8c3da88 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-20_21:15:22 +DATE: 2023-06-16_23:00:03 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.415465e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.242335e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.652983e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.441903e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.284655e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.707498e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.472950 sec - 2,049,277,506 cycles # 2.902 GHz - 2,477,464,277 instructions # 1.21 insn per cycle - 0.976431723 seconds time elapsed +TOTAL : 0.476149 sec + 2,004,075,040 cycles # 2.858 GHz + 2,447,450,357 instructions # 1.22 insn per cycle + 0.758991361 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.960051e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.376444e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.935863e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.358848e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.448514e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.881423e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.570788 sec - 2,355,507,793 cycles # 2.868 GHz - 2,938,291,470 instructions # 1.25 insn per cycle - 0.880416599 seconds time elapsed +TOTAL : 0.560373 sec + 2,330,695,105 cycles # 2.896 GHz + 2,941,791,611 instructions # 1.26 insn per cycle + 0.864662823 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,14 +81,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.179449e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.213524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.213524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.185072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.219142e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219142e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.412496 sec - 4,367,571,047 cycles # 3.082 GHz - 12,734,711,583 instructions # 2.92 insn per cycle - 1.493072004 seconds time elapsed +TOTAL : 1.405955 sec + 4,365,481,367 cycles # 3.097 GHz + 12,734,304,560 instructions # 2.92 insn per cycle + 1.410932077 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 687) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe @@ -107,15 +107,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.150145e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.266713e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.266713e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.160833e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.277513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.277513e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.816456 sec - 2,422,016,689 cycles # 3.065 GHz - 7,010,607,534 instructions # 2.89 insn per cycle - 1.067036767 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2963) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.781135 sec + 2,409,696,110 cycles # 3.070 GHz + 6,927,148,000 instructions # 2.87 insn per cycle + 0.792294841 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2942) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.559343e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.882671e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.882671e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.625524e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.958602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.958602e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.490308 sec - 1,416,701,105 cycles # 2.896 GHz - 3,136,870,946 instructions # 2.21 insn per cycle - 0.620584526 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2967) (512y: 0) (512z: 0) +TOTAL : 0.474614 sec + 1,386,813,698 cycles # 2.899 GHz + 3,036,853,128 instructions # 2.19 insn per cycle + 0.484841120 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.762633e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.117403e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.117403e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.841557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.216736e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.216736e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.459205 sec - 1,342,332,785 cycles # 2.894 GHz - 3,025,285,714 instructions # 2.25 insn per cycle - 0.574037232 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2681) (512y: 238) (512z: 0) +TOTAL : 0.449897 sec + 1,316,149,525 cycles # 2.902 GHz + 2,929,799,564 instructions # 2.23 insn per cycle + 0.464962802 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2576) (512y: 207) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.872342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.076057e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.076057e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.868510e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.078585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.078585e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.595276 sec - 1,205,877,448 cycles # 2.011 GHz - 1,991,853,155 instructions # 1.65 insn per cycle - 0.666783809 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1035) (512y: 205) (512z: 2369) +TOTAL : 0.597358 sec + 1,194,467,791 cycles # 1.987 GHz + 1,906,072,402 instructions # 1.60 insn per cycle + 0.602593356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1002) (512y: 185) (512z: 2242) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index e46f7cd12d..85404bb68d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-20_21:15:41 +DATE: 2023-06-16_23:00:20 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.876951e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204900e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.377247e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.984648e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.208639e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.373355e+08 ) sec^-1 MeanMatrixElemValue = ( 3.402886e+01 +- 1.677500e+01 ) GeV^-2 -TOTAL : 0.467882 sec - 2,003,644,749 cycles # 2.893 GHz - 2,459,578,007 instructions # 1.23 insn per cycle - 1.245327476 seconds time elapsed +TOTAL : 0.467463 sec + 1,994,268,934 cycles # 2.883 GHz + 2,406,150,579 instructions # 1.21 insn per cycle + 0.748794310 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 168 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.375117e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.792250e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.970690e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.108799e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.846646e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967345e+08 ) sec^-1 MeanMatrixElemValue = ( 4.166198e+02 +- 2.517590e+02 ) GeV^-2 -TOTAL : 0.510956 sec - 2,156,092,802 cycles # 2.888 GHz - 2,671,738,628 instructions # 1.24 insn per cycle - 0.806080753 seconds time elapsed +TOTAL : 0.507796 sec + 2,130,658,759 cycles # 2.870 GHz + 2,630,610,519 instructions # 1.23 insn per cycle + 0.801554505 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.203334e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.232785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.232785e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.199190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.228570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.228570e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.384382 sec - 4,264,358,949 cycles # 3.073 GHz - 12,773,106,830 instructions # 3.00 insn per cycle - 1.448002651 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 697) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.388677 sec + 4,265,371,799 cycles # 3.066 GHz + 12,765,139,156 instructions # 2.99 insn per cycle + 1.393497705 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 701) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.352664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.594332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.594332e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.419636e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.674540e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.674540e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.530919 sec - 1,578,395,744 cycles # 3.067 GHz - 4,202,122,413 instructions # 2.66 insn per cycle - 0.772445427 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3730) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.499936 sec + 1,537,781,005 cycles # 3.051 GHz + 4,119,226,953 instructions # 2.68 insn per cycle + 0.510812610 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3693) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.205419e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.354472e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.354472e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.327767e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.544279e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.544279e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.248710 sec - 738,580,552 cycles # 2.918 GHz - 1,688,548,724 instructions # 2.29 insn per cycle - 0.292339947 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3193) (512y: 0) (512z: 0) +TOTAL : 0.245130 sec + 718,016,333 cycles # 2.885 GHz + 1,661,722,651 instructions # 2.31 insn per cycle + 0.255967125 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.444748e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.699668e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.699668e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.836550e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.271069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.271069e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.349259 sec - 694,206,269 cycles # 2.811 GHz - 1,616,284,371 instructions # 2.33 insn per cycle - 0.439825863 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3026) (512y: 15) (512z: 0) +TOTAL : 0.230305 sec + 678,464,193 cycles # 2.902 GHz + 1,594,224,390 instructions # 2.35 insn per cycle + 0.240921400 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2954) (512y: 14) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.594127e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.280888e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.280888e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.868158e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.618302e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.618302e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.316304 sec - 663,204,966 cycles # 2.069 GHz - 1,100,515,454 instructions # 1.66 insn per cycle - 0.354378141 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1593) (512y: 44) (512z: 2363) +TOTAL : 0.302538 sec + 641,516,414 cycles # 2.091 GHz + 1,069,291,782 instructions # 1.67 insn per cycle + 0.314821042 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1530) (512y: 33) (512z: 2277) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index f339615766..0cb2b651aa 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-20_21:55:00 +DATE: 2023-06-16_23:21:51 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -47,14 +47,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.429664e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.074753e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.074753e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.619996e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.273991e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.273991e+07 ) sec^-1 MeanMatrixElemValue = ( 3.419752e+01 +- 1.682900e+01 ) GeV^-2 -TOTAL : 0.478583 sec - 2,040,188,226 cycles # 2.873 GHz - 2,589,417,415 instructions # 1.27 insn per cycle - 0.767753461 seconds time elapsed +TOTAL : 0.476940 sec + 2,047,260,675 cycles # 2.896 GHz + 2,612,418,216 instructions # 1.28 insn per cycle + 0.764236233 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.025340e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.673732e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.673732e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.197562e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.898248e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.898248e+07 ) sec^-1 MeanMatrixElemValue = ( 4.349381e+02 +- 2.541442e+02 ) GeV^-2 -TOTAL : 0.650539 sec - 2,593,587,632 cycles # 2.916 GHz - 3,558,500,249 instructions # 1.37 insn per cycle - 0.948675370 seconds time elapsed +TOTAL : 0.646628 sec + 2,586,558,663 cycles # 2.919 GHz + 3,548,806,657 instructions # 1.37 insn per cycle + 0.945018609 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,15 +94,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.212985e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.242892e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242892e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.208772e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.238538e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238538e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.375663 sec - 4,275,613,986 cycles # 3.100 GHz - 12,777,760,633 instructions # 2.99 insn per cycle - 1.381142304 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 697) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.380519 sec + 4,277,155,433 cycles # 3.092 GHz + 12,769,582,721 instructions # 2.99 insn per cycle + 1.385373229 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 701) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -121,15 +121,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.326134e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.572745e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.572745e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.430847e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.687591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.687591e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.517533 sec - 1,597,216,466 cycles # 3.065 GHz - 4,249,723,492 instructions # 2.66 insn per cycle - 0.531834962 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3730) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.502214 sec + 1,557,572,574 cycles # 3.074 GHz + 4,167,570,923 instructions # 2.68 insn per cycle + 0.514108531 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3693) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,15 +148,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.080671e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.214068e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.214068e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.366004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.582913e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.582913e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.256373 sec - 757,632,904 cycles # 2.913 GHz - 1,724,960,904 instructions # 2.28 insn per cycle - 0.261236709 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3193) (512y: 0) (512z: 0) +TOTAL : 0.247132 sec + 735,373,188 cycles # 2.931 GHz + 1,698,741,216 instructions # 2.31 insn per cycle + 0.251948722 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3109) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -175,15 +175,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.680840e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.012936e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.012936e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.814926e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.180518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.180518e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.238248 sec - 707,237,931 cycles # 2.916 GHz - 1,652,694,649 instructions # 2.34 insn per cycle - 0.243543396 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3026) (512y: 15) (512z: 0) +TOTAL : 0.234804 sec + 694,783,748 cycles # 2.909 GHz + 1,631,382,774 instructions # 2.35 insn per cycle + 0.239954292 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2954) (512y: 14) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -202,15 +202,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.622658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.300085e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.300085e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.811317e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.536250e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.536250e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.318841 sec - 682,441,317 cycles # 2.112 GHz - 1,141,535,767 instructions # 1.67 insn per cycle - 0.324263151 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1593) (512y: 44) (512z: 2363) +TOTAL : 0.309567 sec + 660,659,148 cycles # 2.110 GHz + 1,111,031,208 instructions # 1.68 insn per cycle + 0.314367440 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1530) (512y: 33) (512z: 2277) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 206bd8cef9..1ed4c388b7 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-20_21:15:59 +DATE: 2023-06-16_23:00:36 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,30 +43,30 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.833296e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.177467e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.357200e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.864221e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200538e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.353895e+08 ) sec^-1 MeanMatrixElemValue = ( 3.402886e+01 +- 1.677500e+01 ) GeV^-2 -TOTAL : 0.466083 sec - 2,007,376,484 cycles # 2.914 GHz - 2,418,083,664 instructions # 1.20 insn per cycle - 0.912641196 seconds time elapsed +TOTAL : 0.468538 sec + 1,986,332,099 cycles # 2.873 GHz + 2,412,092,912 instructions # 1.21 insn per cycle + 0.749649403 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -==PROF== Profiling "sigmaKin": launch__registers_per_thread 162 +==PROF== Profiling "sigmaKin": launch__registers_per_thread 161 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.303926e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.784234e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.958728e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.066200e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816896e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.937623e+08 ) sec^-1 MeanMatrixElemValue = ( 4.166198e+02 +- 2.517590e+02 ) GeV^-2 -TOTAL : 0.510629 sec - 2,184,124,530 cycles # 2.916 GHz - 2,654,043,166 instructions # 1.22 insn per cycle - 0.806756219 seconds time elapsed +TOTAL : 0.506737 sec + 2,138,564,502 cycles # 2.876 GHz + 2,655,004,925 instructions # 1.24 insn per cycle + 0.801248487 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,14 +81,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.218276e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.247948e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.247948e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.218143e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.248221e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.248221e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422773e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 1.365746 sec - 4,229,222,706 cycles # 3.087 GHz - 12,672,475,084 instructions # 3.00 insn per cycle - 1.393065483 seconds time elapsed +TOTAL : 1.366158 sec + 4,229,850,435 cycles # 3.088 GHz + 12,672,250,194 instructions # 3.00 insn per cycle + 1.370859371 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 648) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -107,15 +107,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.708772e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.004689e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.004689e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.810631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.132301e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.132301e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422772e+01 +- 1.683421e+01 ) GeV^-2 -TOTAL : 0.462124 sec - 1,434,605,054 cycles # 3.072 GHz - 4,078,202,850 instructions # 2.84 insn per cycle - 0.577421325 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3461) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.450376 sec + 1,402,013,020 cycles # 3.084 GHz + 4,014,545,759 instructions # 2.86 insn per cycle + 0.461135794 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3449) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.312600e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.914718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.914718e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.649698e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.339892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.339892e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.360829 sec - 970,775,497 cycles # 2.904 GHz - 2,016,615,650 instructions # 2.08 insn per cycle - 0.912368281 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3908) (512y: 0) (512z: 0) +TOTAL : 0.310950 sec + 917,414,969 cycles # 2.911 GHz + 1,944,404,967 instructions # 2.12 insn per cycle + 0.321816236 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.553413e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.198546e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.198546e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.731171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.478338e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.478338e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.316325 sec - 937,465,425 cycles # 2.920 GHz - 1,947,933,253 instructions # 2.08 insn per cycle - 0.726128836 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3766) (512y: 10) (512z: 0) +TOTAL : 0.306986 sec + 888,396,059 cycles # 2.858 GHz + 1,866,890,226 instructions # 2.10 insn per cycle + 0.318202607 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3561) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.977017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.316722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.316722e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.528801e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.968883e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.968883e+05 ) sec^-1 MeanMatrixElemValue = ( 3.422183e+01 +- 1.683665e+01 ) GeV^-2 -TOTAL : 0.436206 sec - 845,165,820 cycles # 1.915 GHz - 1,472,239,559 instructions # 1.74 insn per cycle - 0.714561373 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2260) (512y: 24) (512z: 2944) +TOTAL : 0.384950 sec + 796,333,894 cycles # 2.047 GHz + 1,364,853,040 instructions # 1.71 insn per cycle + 0.390179000 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2080) (512y: 25) (512z: 2631) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 0321169d45..a86220883d 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-20_21:16:18 +DATE: 2023-06-16_23:00:52 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.443658e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.367966e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.827577e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.481426e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.487957e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.943892e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.473331 sec - 2,041,533,647 cycles # 2.891 GHz - 2,459,961,698 instructions # 1.20 insn per cycle - 1.179787154 seconds time elapsed +TOTAL : 0.474499 sec + 2,055,268,717 cycles # 2.894 GHz + 2,489,636,002 instructions # 1.21 insn per cycle + 0.767461965 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.975045e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.462268e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.042860e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.380889e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.587624e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.036549e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.569403 sec - 2,382,562,928 cycles # 2.899 GHz - 2,955,376,317 instructions # 1.24 insn per cycle - 0.879909929 seconds time elapsed +TOTAL : 0.556289 sec + 2,350,861,575 cycles # 2.922 GHz + 2,945,489,310 instructions # 1.25 insn per cycle + 0.862355038 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -81,15 +81,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.147655e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180074e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180074e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.161099e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.194017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.194017e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.451033 sec - 4,441,834,390 cycles # 3.054 GHz - 12,825,895,656 instructions # 2.89 insn per cycle - 1.485153834 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.441531 sec + 4,441,671,913 cycles # 3.079 GHz + 12,830,518,319 instructions # 2.89 insn per cycle + 1.448679311 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 708) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,15 +107,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.121773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.233820e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.233820e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.046596e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.158375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.158375e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.819219 sec - 2,478,541,180 cycles # 3.096 GHz - 7,076,448,533 instructions # 2.86 insn per cycle - 0.885222654 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3198) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.825457 sec + 2,446,304,474 cycles # 2.949 GHz + 6,976,512,368 instructions # 2.85 insn per cycle + 0.837405524 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3144) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.023911e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.442523e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.442523e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.998918e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.420645e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.420645e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.431325 sec - 1,269,223,798 cycles # 2.909 GHz - 2,843,754,763 instructions # 2.24 insn per cycle - 0.765797639 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2928) (512y: 0) (512z: 0) +TOTAL : 0.433627 sec + 1,261,017,634 cycles # 2.881 GHz + 2,809,295,715 instructions # 2.23 insn per cycle + 0.438687115 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2872) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.414267e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.918242e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.918242e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.490422e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.009169e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.009169e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.395051 sec - 1,161,495,236 cycles # 2.905 GHz - 2,701,623,879 instructions # 2.33 insn per cycle - 0.499917445 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2727) (512y: 69) (512z: 0) +TOTAL : 0.388524 sec + 1,143,580,715 cycles # 2.915 GHz + 2,667,697,890 instructions # 2.33 insn per cycle + 0.393399213 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2679) (512y: 60) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.867702e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.074723e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.074723e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.897672e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.110284e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.110284e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.597442 sec - 1,200,394,667 cycles # 1.993 GHz - 1,749,052,577 instructions # 1.46 insn per cycle - 0.663402708 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1475) (512y: 109) (512z: 2250) +TOTAL : 0.591572 sec + 1,191,668,768 cycles # 2.001 GHz + 1,710,006,963 instructions # 1.43 insn per cycle + 0.596717394 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1441) (512y: 85) (512z: 2182) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index d0951d97b5..d5360b871e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -35,7 +35,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-07-20_21:16:37 +DATE: 2023-06-16_23:01:09 On itscrd80.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -43,14 +43,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.426081e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.292116e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.711696e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.438523e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.296220e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.712443e+07 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.472038 sec - 2,071,948,490 cycles # 2.921 GHz - 2,501,045,175 instructions # 1.21 insn per cycle - 0.839388202 seconds time elapsed +TOTAL : 0.473737 sec + 2,022,437,137 cycles # 2.902 GHz + 2,442,479,633 instructions # 1.21 insn per cycle + 0.756210762 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% @@ -59,14 +59,14 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.2.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.956038e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.349090e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.901687e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.355982e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.498538e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.934566e+07 ) sec^-1 MeanMatrixElemValue = ( 4.282445e+02 +- 2.530899e+02 ) GeV^-2 -TOTAL : 0.562732 sec - 2,439,467,564 cycles # 2.946 GHz - 2,979,072,440 instructions # 1.22 insn per cycle - 0.886051781 seconds time elapsed +TOTAL : 0.556362 sec + 2,332,917,861 cycles # 2.900 GHz + 2,935,857,970 instructions # 1.26 insn per cycle + 0.862169599 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -81,14 +81,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.170665e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204019e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.204019e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.170795e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204914e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204914e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 1.422770 sec - 4,390,372,476 cycles # 3.078 GHz - 12,709,187,143 instructions # 2.89 insn per cycle - 1.464667010 seconds time elapsed +TOTAL : 1.422352 sec + 4,386,288,323 cycles # 3.077 GHz + 12,708,814,441 instructions # 2.90 insn per cycle + 1.427223028 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 659) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -107,15 +107,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.161076e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.278375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.278375e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.148376e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.266083e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.266083e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.780956 sec - 2,413,884,473 cycles # 3.072 GHz - 6,854,388,052 instructions # 2.84 insn per cycle - 0.882702669 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3023) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.786032 sec + 2,382,887,401 cycles # 3.016 GHz + 6,777,982,529 instructions # 2.84 insn per cycle + 0.796666620 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3010) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -133,15 +133,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.584339e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.912145e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.912145e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.604898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.935937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.935937e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.480541 sec - 1,416,941,029 cycles # 2.921 GHz - 3,077,847,324 instructions # 2.17 insn per cycle - 0.540344645 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3126) (512y: 0) (512z: 0) +TOTAL : 0.478024 sec + 1,400,367,251 cycles # 2.905 GHz + 2,987,020,671 instructions # 2.13 insn per cycle + 0.488917156 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3010) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -159,15 +159,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.795490e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.154345e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.154345e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.857659e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.247357e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.247357e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.454238 sec - 1,331,527,898 cycles # 2.903 GHz - 2,965,445,899 instructions # 2.23 insn per cycle - 0.495622366 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2834) (512y: 250) (512z: 0) +TOTAL : 0.448265 sec + 1,311,795,258 cycles # 2.900 GHz + 2,871,509,338 instructions # 2.19 insn per cycle + 0.453504742 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2738) (512y: 216) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -185,15 +185,15 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.832442e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.031511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.031511e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.874954e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.085752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.085752e+05 ) sec^-1 MeanMatrixElemValue = ( 3.404831e+01 +- 1.677228e+01 ) GeV^-2 -TOTAL : 0.603751 sec - 1,221,857,414 cycles # 2.010 GHz - 1,946,814,737 instructions # 1.59 insn per cycle - 0.640910959 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1381) (512y: 212) (512z: 2435) +TOTAL : 0.594897 sec + 1,202,081,331 cycles # 2.008 GHz + 1,864,012,772 instructions # 1.55 insn per cycle + 0.599852743 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1344) (512y: 191) (512z: 2311) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests.