diff --git a/.github/workflows/testsuite_allprocesses.yml b/.github/workflows/testsuite_allprocesses.yml index 7eaad09c9f..1d67a9efaf 100644 --- a/.github/workflows/testsuite_allprocesses.yml +++ b/.github/workflows/testsuite_allprocesses.yml @@ -19,8 +19,9 @@ on: branches: [ master ] # Trigger the all-processes workflow when new changes to the workflow are pushed - push: - paths: [ .github/workflows/testsuite* ] + # (NB: this is now disabled to avoid triggering two jobs when pushing to a branch for which a PR is opened) + ###push: + ### paths: [ .github/workflows/testsuite* ] #---------------------------------------------------------------------------------------------------------------------------------- diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh index 639991f1cb..d49d70c200 100755 --- a/.github/workflows/testsuite_oneprocess.sh +++ b/.github/workflows/testsuite_oneprocess.sh @@ -35,19 +35,23 @@ function codegen() { ./CODEGEN/generateAndCompare.sh -q ${proc%.sa} fi # Check if there are any differences to the current repo - ###compare=true # enable comparison to current git repo - compare=false # disable comparison to current git repo - if [ ${compare} ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then + ###compare=1 # enable comparison to current git repo + compare=0 # disable comparison to current git repo + if [ "${compare}" != "0" ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then + echo + echo "Compare newly generated code for ${proc} to that in the madgraph4gpu github repository" git checkout HEAD ${proc}/CODEGEN*.txt if [ "${proc%.mad}" != "${proc}" ]; then git checkout HEAD ${proc}/Cards/me5_configuration.txt ###sed -i 's/DEFAULT_F2PY_COMPILER=f2py.*/DEFAULT_F2PY_COMPILER=f2py3/' ${proc}/Source/make_opts git checkout HEAD ${proc}/Source/make_opts fi - echo echo "git diff (start)" git diff --exit-code echo "git diff (end)" + else + echo + echo "(SKIP comparison of newly generated code for ${proc} to that in the madgraph4gpu github repository)" fi } diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index d8c1613ccf..23f61b93fd 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit d8c1613ccf638b5b078a64379e385def5649622c +Subproject commit 23f61b93fdf268a1cdcbd363cd449c88b3511d7a diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 5b557e832a..5267141530 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -149,6 +149,11 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): ###helas_exporter = None helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341! + # Default class for the run_card to use + from . import launch_plugin + run_card_class = launch_plugin.CPPRunCard + + # AV (default from OM's tutorial) - add a debug printout def __init__(self, *args, **kwargs): self.in_madevent_mode = False # see MR #747 @@ -203,7 +208,7 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): cmdhistory is the list of command used so far. MG5options are all the options of the main interface outputflags is a list of options provided when doing the output command""" - misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self)) + ###misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self)) if self.in_madevent_mode: self.add_input_for_banner() if 'CUDACPP_CODEGEN_PATCHLEVEL' in os.environ: patchlevel = os.environ['CUDACPP_CODEGEN_PATCHLEVEL'] @@ -237,7 +242,8 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): raise Exception('ERROR! the O/S call to patchMad.sh failed') # Additional patching (OM) self.add_madevent_plugin_fct() # Added by OM - return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) + # do not call standard finalize since is this is already done... + #return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) # AV (default from OM's tutorial) - overload settings and add a debug printout def modify_grouping(self, matrix_element): diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 1f416ac5b9..f3f830205c 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -152,7 +152,7 @@ function codeGenAndDiff() echo -e "\n+++ Generate code for '$proc'\n" ###exit 0 # FOR DEBUGGING # Vector size for mad/madonly meexporter (VECSIZE_MEMMAX) - vecsize=16384 # NB THIS IS NO LONGER IGNORED (but will eventually be tunable via runcards) + vecsize=32 # NB THIS IS NO LONGER IGNORED (but will eventually be tunable via runcards) # Generate code for the specific process pushd $MG5AMC_HOME >& /dev/null mkdir -p ../TMPOUT @@ -201,13 +201,28 @@ function codeGenAndDiff() cat ${outproc}_log.txt | egrep -v '(Crash Annotation)' > ${outproc}_log.txt.new # remove firefox 'glxtest: libEGL initialize failed' errors \mv ${outproc}_log.txt.new ${outproc}_log.txt fi + # Check the code generation log for errors + if [ -d ${outproc} ] && ! grep -q "Please report this bug" ${outproc}_log.txt; then + ###cat ${outproc}_log.txt; exit 0 # FOR DEBUGGING + cat ${MG5AMC_HOME}/${outproc}_log.txt | { egrep 'INFO: (Try|Creat|Organiz|Process)' || true; } + else + echo "*** ERROR! Code generation failed" + cat ${MG5AMC_HOME}/${outproc}_log.txt + echo "*** ERROR! Code generation failed" + exit 1 + fi # Patches moved here from patchMad.sh after Olivier's PR #764 (THIS IS ONLY NEEDED IN THE MADGRAPH4GPU GIT REPO) if [ "${OUTBCK}" == "mad" ]; then # Force the use of strategy SDE=1 in multichannel mode (see #419) sed -i 's/2 = sde_strategy/1 = sde_strategy/' ${outproc}/Cards/run_card.dat + # Force the use of VECSIZE_MEMMAX=16384 + sed -i 's/16 = vector_size/16384 = vector_size/' ${outproc}/Cards/run_card.dat + # Force the use of fast-math in Fortran builds + sed -i 's/-O = global_flag.*/-O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)/' ${outproc}/Cards/run_card.dat # Generate run_card.inc and param_card.inc (include stdout and stderr in the code generation log which is later checked for errors) # These two steps are part of "cd Source; make" but they actually are code-generating steps - ${outproc}/bin/madevent treatcards run >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") + # Note: treatcards run also regenerates vector.inc if vector_size has changed in the runcard + ${outproc}/bin/madevent treatcards run >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") ${outproc}/bin/madevent treatcards param >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") # Cleanup \rm -f ${outproc}/crossx.html @@ -223,37 +238,18 @@ function codeGenAndDiff() touch ${outproc}/HTML/.keep # new file if [ "${patchlevel}" != "0" ]; then # Add global flag '-O3 -ffast-math -fbounds-check' as in previous gridpacks + # (FIXME? these flags are already set in the runcards, why are they not propagated to make_opts?) echo "GLOBAL_FLAG=-O3 -ffast-math -fbounds-check" > ${outproc}/Source/make_opts.new cat ${outproc}/Source/make_opts >> ${outproc}/Source/make_opts.new \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts fi if [ "${patchlevel}" == "2" ]; then sed -i 's/DEFAULT_F2PY_COMPILER=f2py.*/DEFAULT_F2PY_COMPILER=f2py3/' ${outproc}/Source/make_opts - cat ${outproc}/Source/make_opts | sed '/#end/q' | sort > ${outproc}/Source/make_opts.new + cat ${outproc}/Source/make_opts | sed '/#end/q' | head --lines=-1 | sort > ${outproc}/Source/make_opts.new cat ${outproc}/Source/make_opts | sed -n -e '/#end/,$p' >> ${outproc}/Source/make_opts.new \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts - echo " -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) - -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA" >> ${outproc}/Cards/run_card.dat fi fi - # Check the code generation log for errors - if [ -d ${outproc} ] && ! grep -q "Please report this bug" ${outproc}_log.txt; then - ###cat ${outproc}_log.txt; exit 0 # FOR DEBUGGING - cat ${MG5AMC_HOME}/${outproc}_log.txt | { egrep 'INFO: (Try|Creat|Organiz|Process)' || true; } - else - echo "*** ERROR! Code generation failed" - cat ${MG5AMC_HOME}/${outproc}_log.txt - echo "*** ERROR! Code generation failed" - exit 1 - fi popd >& /dev/null # Choose which directory must be copied (for gridpack generation: untar and modify the gridpack) if [ "${SCRBCK}" == "gridpack" ]; then diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index e6546f684c..e090137829 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005372047424316406  +DEBUG: model prefixing takes 0.005680561065673828  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -156,15 +156,15 @@ INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams 1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams -output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -183,27 +183,27 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  WARNING: vector code for lepton pdf not implemented. We removed the option to run dressed lepton  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.098 s +Wrote files for 8 helas calls in 0.097 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.200 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 3 routines in 0.199 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.255 s +ALOHA: aloha creates 7 routines in 0.254 s FFV1 FFV1 FFV2 @@ -226,7 +226,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -242,16 +241,16 @@ patching file matrix1.f Hunk #3 succeeded at 230 (offset 9 lines). Hunk #4 succeeded at 267 (offset 18 lines). Hunk #5 succeeded at 312 (offset 18 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m4.853s -user 0m1.653s -sys 0m0.201s +real 0m1.848s +user 0m1.621s +sys 0m0.225s ************************************************************ * * * W E L C O M E to * @@ -279,10 +278,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index 618adbca06..22e76563ab 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --\ -vector_size=16384 --me_exporter=standalone_cudacpp +vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat index 86f6a33258..1084532333 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat @@ -85,7 +85,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -182,12 +198,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat index 67c1a4de28..0fa0402261 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat @@ -85,7 +85,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -181,3 +197,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/ee_mumu.mad/mg5.in b/epochX/cudacpp/ee_mumu.mad/mg5.in index 12a2c58512..4e83015b40 100644 --- a/epochX/cudacpp/ee_mumu.mad/mg5.in +++ b/epochX/cudacpp/ee_mumu.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate e+ e- > mu+ mu- -output madevent ee_mumu.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ee_mumu.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 8cb80f0d38..3447ea61e3 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005633831024169922  +DEBUG: model prefixing takes 0.0055027008056640625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,34 +154,34 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.270 s FFV1 FFV1 FFV2 @@ -198,9 +198,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.653s -user 0m0.601s -sys 0m0.049s +real 0m0.657s +user 0m0.602s +sys 0m0.050s diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a1fa47508f..3326a8488f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005694150924682617  +DEBUG: model prefixing takes 0.005394458770751953  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,15 +157,15 @@ INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams 1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,23 +184,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 2 routines in 0.142 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.138 s VVV1 FFV1 FFV1 @@ -219,7 +219,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -231,16 +230,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.772s -user 0m1.470s -sys 0m0.223s +real 0m1.677s +user 0m1.453s +sys 0m0.213s ************************************************************ * * * W E L C O M E to * @@ -268,9 +267,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 4c14989a3f..cf111e2e6d 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --ve\ -ctor_size=16384 --me_exporter=standalone_cudacpp +ctor_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 80d07f317e..a6463dc262 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -140,12 +156,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 48d84c73f0..27e990a016 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -139,3 +155,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_tt.mad/mg5.in b/epochX/cudacpp/gg_tt.mad/mg5.in index 7859bf9b80..b4b356fc51 100644 --- a/epochX/cudacpp/gg_tt.mad/mg5.in +++ b/epochX/cudacpp/gg_tt.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ -output madevent gg_tt.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_tt.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 805df19bd9..f09b4fa669 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00567626953125  +DEBUG: model prefixing takes 0.00550079345703125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,26 +161,26 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.146 s VVV1 FFV1 FFV1 @@ -193,9 +193,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.529s -user 0m0.478s -sys 0m0.048s +real 0m0.583s +user 0m0.466s +sys 0m0.061s diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 9d4dbd85f0..acacaf4036 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005400419235229492  +DEBUG: model prefixing takes 0.0055658817291259766  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,15 +165,15 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams 1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,15 +194,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -211,29 +211,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s -Wrote files for 46 helas calls in 0.242 s +Wrote files for 46 helas calls in 0.245 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.324 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.321 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.308 s +ALOHA: aloha creates 10 routines in 0.304 s VVV1 VVV1 FFV1 @@ -257,7 +257,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -277,16 +276,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.282s -user 0m2.049s -sys 0m0.227s +real 0m2.697s +user 0m2.018s +sys 0m0.247s ************************************************************ * * * W E L C O M E to * @@ -314,9 +313,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index d0845f65f5..06ea2195d8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -47,4 +47,4 @@ define vl = ve vm vt define vl~ = ve~ vm~ vt~ add process g g > t t~ g output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False -\ --vector_size=16384 --me_exporter=standalone_cudacpp +-vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat index 3360456835..f7c6502eec 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -161,12 +177,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat index 46378139c8..0e45d1a8c9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat @@ -98,7 +98,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -160,3 +176,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_tt01g.mad/mg5.in b/epochX/cudacpp/gg_tt01g.mad/mg5.in index a20e166e81..95984fcf10 100644 --- a/epochX/cudacpp/gg_tt01g.mad/mg5.in +++ b/epochX/cudacpp/gg_tt01g.mad/mg5.in @@ -2,4 +2,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ add process g g > t t~ g -output madevent gg_tt01g.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_tt01g.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 68afa8d9b0..b52dc31122 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005378007888793945  +DEBUG: model prefixing takes 0.0053288936614990234  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,15 +157,15 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams 1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.148 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.155 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.326 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.309 s +ALOHA: aloha creates 10 routines in 0.327 s VVV1 VVV1 FFV1 @@ -230,7 +230,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -246,16 +245,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m5.147s -user 0m1.924s -sys 0m0.225s +real 0m2.189s +user 0m1.943s +sys 0m0.223s ************************************************************ * * * W E L C O M E to * @@ -283,9 +282,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index a0ffbbc219..9d09090869 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --v\ -ector_size=16384 --me_exporter=standalone_cudacpp +ector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat index a9fbb5a212..f119b5a1c7 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -143,12 +159,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat index 1464f610ba..e89c78702d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -142,3 +158,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttg.mad/mg5.in b/epochX/cudacpp/gg_ttg.mad/mg5.in index 98f53ce50d..e37d10d865 100644 --- a/epochX/cudacpp/gg_ttg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g -output madevent gg_ttg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 97056958fe..b9716683be 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005817890167236328  +DEBUG: model prefixing takes 0.005490303039550781  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.326 s +ALOHA: aloha creates 5 routines in 0.322 s VVV1 VVV1 FFV1 @@ -201,9 +201,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.779s -user 0m0.713s -sys 0m0.062s +real 0m0.784s +user 0m0.699s +sys 0m0.063s diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index eacd7a356a..ee3d38dfb1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053293704986572266  +DEBUG: model prefixing takes 0.00561833381652832  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.157 s +1 processes with 123 diagrams generated in 0.154 s Total: 1 processes with 123 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s -Wrote files for 222 helas calls in 0.691 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +Wrote files for 222 helas calls in 0.679 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.333 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.323 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.314 s VVV1 VVV1 FFV1 @@ -233,7 +233,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -249,15 +248,15 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m6.262s -user 0m3.028s +real 0m3.226s +user 0m2.976s sys 0m0.232s ************************************************************ * * @@ -286,9 +285,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index b7568d1a73..ea9cfcde68 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --\ -vector_size=16384 --me_exporter=standalone_cudacpp +vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat index 88d4377d71..42728a48f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -168,12 +184,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat index 26fa7f39dd..0374d7ba60 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -167,3 +183,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttgg.mad/mg5.in b/epochX/cudacpp/gg_ttgg.mad/mg5.in index e2c5858b63..53784bf161 100644 --- a/epochX/cudacpp/gg_ttgg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttgg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g g -output madevent gg_ttgg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttgg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 80631c94bf..d62aabc436 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00567317008972168  +DEBUG: model prefixing takes 0.005473136901855469  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,29 +161,29 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.318 s +ALOHA: aloha creates 5 routines in 0.894 s VVV1 VVV1 FFV1 @@ -204,9 +204,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m4.435s -user 0m1.373s +real 0m2.023s +user 0m1.361s sys 0m0.056s diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index ab3974344c..d94e7252af 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005319833755493164  +DEBUG: model prefixing takes 0.005348920822143555  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.855 s +1 processes with 1240 diagrams generated in 1.857 s Total: 1 processes with 1240 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -175,9 +175,9 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 7s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -186,29 +186,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.574 s -Wrote files for 2281 helas calls in 18.431 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.607 s +Wrote files for 2281 helas calls in 18.169 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.335 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.317 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -235,7 +235,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -251,16 +250,16 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m32.103s -user 0m28.586s -sys 0m0.412s +real 0m28.894s +user 0m28.357s +sys 0m0.382s ************************************************************ * * * W E L C O M E to * @@ -288,9 +287,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 2f92ecc4ba..3923568dd8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False -\ --vector_size=16384 --me_exporter=standalone_cudacpp +-vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat index 0a6bf20eb9..8170176a11 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -174,12 +190,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat index 3714e71997..8da9ac1563 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -173,3 +189,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttggg.mad/mg5.in b/epochX/cudacpp/gg_ttggg.mad/mg5.in index cdbc845cdd..f92d17d219 100644 --- a/epochX/cudacpp/gg_ttggg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttggg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g g g -output madevent gg_ttggg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttggg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 33bae20142..8660fec52a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005532503128051758  +DEBUG: model prefixing takes 0.005609273910522461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.880 s +1 processes with 1240 diagrams generated in 1.857 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.540 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.536 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.351 s +ALOHA: aloha creates 5 routines in 0.341 s VVV1 VVV1 FFV1 @@ -204,9 +204,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m15.959s -user 0m12.810s -sys 0m0.102s +real 0m12.866s +user 0m12.717s +sys 0m0.100s diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 89cb2749b0..97f5e25170 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057373046875  +DEBUG: model prefixing takes 0.005373477935791016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,17 +170,17 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -207,15 +207,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -224,19 +224,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.217 s +Wrote files for 32 helas calls in 0.219 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines ALOHA: aloha creates 2 routines in 0.144 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines @@ -260,7 +260,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -288,16 +287,16 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m4.915s -user 0m1.680s -sys 0m0.237s +real 0m1.929s +user 0m1.701s +sys 0m0.227s ************************************************************ * * * W E L C O M E to * @@ -325,9 +324,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index efb0752a31..c49c39c3c4 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -48,4 +48,4 @@ define vl~ = ve~ vm~ vt~ define q = u c d s u~ c~ d~ s~ generate g q > t t~ q output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --v\ -ector_size=16384 --me_exporter=standalone_cudacpp +ector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat index 69e6fa2bf2..d987e643f0 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat @@ -82,7 +82,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -145,12 +161,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat index 4c99f39248..ef9e4b2707 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat @@ -82,7 +82,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -144,3 +160,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gq_ttq.mad/mg5.in b/epochX/cudacpp/gq_ttq.mad/mg5.in index e93843b8cd..2273ae9cfd 100644 --- a/epochX/cudacpp/gq_ttq.mad/mg5.in +++ b/epochX/cudacpp/gq_ttq.mad/mg5.in @@ -2,4 +2,4 @@ set stdout_level DEBUG set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ generate g q > t t~ q -output madevent gq_ttq.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gq_ttq.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 16374bd28e..91509797eb 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005791902542114258  +DEBUG: model prefixing takes 0.005265951156616211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,14 +170,14 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 @@ -190,28 +190,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=1 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=1 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.141 s FFV1 FFV1 FFV1 @@ -225,9 +225,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.656s -user 0m0.594s -sys 0m0.059s +real 0m0.640s +user 0m0.574s +sys 0m0.061s diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 3b04fc3fb3..452dbff73e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -135,25 +135,25 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_h Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.060 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -163,9 +163,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.422s -user 0m0.371s -sys 0m0.048s +real 0m0.415s +user 0m0.358s +sys 0m0.050s diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 8b6ca99446..e73dd42300 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00538325309753418  +DEBUG: model prefixing takes 0.0053882598876953125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.029 s +5 processes with 7 diagrams generated in 0.031 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.134 s +13 processes with 76 diagrams generated in 0.141 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,17 +378,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.811 s +65 processes with 1119 diagrams generated in 1.922 s Total: 83 processes with 1202 diagrams -output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -506,15 +506,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -523,15 +523,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -540,15 +540,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  2 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -557,15 +557,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  3 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -574,15 +574,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  4 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -591,15 +591,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  5 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -608,15 +608,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  6 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -625,15 +625,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  7 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -642,15 +642,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  8 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -659,15 +659,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  9 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -676,15 +676,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  10 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -693,15 +693,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  11 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -710,15 +710,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  12 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -727,15 +727,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  13 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -744,15 +744,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  14 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -761,15 +761,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  15 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -778,15 +778,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  16 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -795,29 +795,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1] [export_cpp.py at line 711]  DEBUG: subproc_number =  17 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.267 s -Wrote files for 810 helas calls in 3.215 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.279 s +Wrote files for 810 helas calls in 3.193 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.333 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.331 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.312 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -844,7 +844,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -1022,16 +1021,16 @@ Hunk #2 succeeded at 194 (offset 51 lines). Hunk #3 succeeded at 272 (offset 51 lines). Hunk #4 succeeded at 300 (offset 51 lines). Hunk #5 succeeded at 345 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m11.764s -user 0m8.242s -sys 0m0.480s +real 0m8.865s +user 0m8.358s +sys 0m0.477s ************************************************************ * * * W E L C O M E to * @@ -1059,9 +1058,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index c0b1a2fd98..22837639b6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -50,4 +50,4 @@ generate p p > t t~ @0 add process p p > t t~ j @1 add process p p > t t~ j j @2 output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False \ ---vector_size=16384 --me_exporter=standalone_cudacpp +--vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat index f4ca7c0b5d..8b1007a3e5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -186,12 +202,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat index 58e12c11d7..2f72b65255 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -185,3 +201,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/pp_tt012j.mad/mg5.in b/epochX/cudacpp/pp_tt012j.mad/mg5.in index 91e22f5295..6bc40e7968 100644 --- a/epochX/cudacpp/pp_tt012j.mad/mg5.in +++ b/epochX/cudacpp/pp_tt012j.mad/mg5.in @@ -4,4 +4,4 @@ define j = p generate p p > t t~ @0 add process p p > t t~ j @1 add process p p > t t~ j j @2 -output madevent pp_tt012j.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent pp_tt012j.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp