From 95e9fd05d8ab8baa920760aaca93776430a71ad8 Mon Sep 17 00:00:00 2001 From: Olivier Mattelaer Date: Fri, 3 Nov 2023 08:24:09 +0100 Subject: [PATCH 01/28] add the option to choose the floating_point interface from the run_card (via make_opts) --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index c9d1c7706a..56c7ebaea4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -16,11 +16,13 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): @@ -32,6 +34,11 @@ def compile(self, *args, **opts): import pathlib import os pjoin = os.path.join + if 'cwd' in opts: + path = pjoin(opts['cwd'], os.pardir, os.pardir, 'Source', 'make_opts') + common_run_interface.CommonRunCmd.update_make_opts_full( + path, + {'FPTYPE': self.run_card['floating_type']}) cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -57,13 +64,29 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], 
cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + + def reset_makeopts(self, old_value, new_value, name): + + misc.sprint("PASS IN RESET MAKEOPTS", old_value, new_value, name) + if not hasattr(self, 'path'): + raise Exception + + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + else: + raise Exception + + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{})) def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" @@ -79,16 +102,13 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False) -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From cf822a089cc65914ce2f475a7ba5077d8f2a5bb1 Mon Sep 17 00:00:00 2001 From: Olivier Mattelaer Date: Mon, 6 Nov 2023 13:54:02 +0100 Subject: [PATCH 02/28] better formatting for the card from the start and adding the possiblitity to choose avx --- MG5aMC/mg5amcnlo | 2 +- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 74 +++++++++++++++---- .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 9 ++- 3 files changed, 66 insertions(+), 19 deletions(-) diff 
--git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 49c93e01b8..8a18cc2423 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 49c93e01b8596cbdb4e65f628601de1e6f08c744 +Subproject commit 8a18cc2423616ee91c4f9d74eec0cb2901e0fd2a diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 56c7ebaea4..120d30304b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -30,15 +30,17 @@ def compile(self, *args, **opts): import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join - if 'cwd' in opts: - path = pjoin(opts['cwd'], os.pardir, os.pardir, 'Source', 'make_opts') - common_run_interface.CommonRunCmd.update_make_opts_full( - path, - {'FPTYPE': self.run_card['floating_type']}) + + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_type }) + misc.sprint('FPTYPE checked') + + + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -53,7 +55,26 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) + +# Phase-Space Optimization 
------------------------------------------------------------------------------------ +template_on = \ +"""#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization + %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +""" + +template_off = '' + +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + + class CPPRunCard(banner_mod.RunCardLO): + + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): logger.warning('WARNING! CPPRunCard instance has no attribute path') @@ -67,12 +88,17 @@ def reset_simd(self, old_value, new_value, name): def reset_makeopts(self, old_value, new_value, name): - misc.sprint("PASS IN RESET MAKEOPTS", old_value, new_value, name) if not hasattr(self, 'path'): raise Exception + avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_type': + if new_value == 'Auto': + new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) else: raise Exception @@ -84,9 +110,22 @@ def plugin_input(self, finput): def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) self.add_param('floating_type', 'd', include=False, hidden=False, - fct_mod=(self.reset_makeopts,(),{})) + fct_mod=(self.reset_makeopts,(),{}), + 
allowed=['m','d','f']) + self.add_param('avx_type', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + + +>>>>>>> 746f16476 (better formatting for the card from the start and adding the possiblitity to choose avx) def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" @@ -104,10 +143,13 @@ def check_validity(self): self['hel_recycling'] = False class GPURunCard(CPPRunCard): + def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - self.add_param('floating_type', 'd', include=False, hidden=False) + + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 8961036fb1..80d0c7e8bc 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -149,6 +149,11 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): ###helas_exporter = None helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341! + # Default class for the run_card to use + from . 
import launch_plugin + run_card_class = launch_plugin.CPPRunCard + + # AV (default from OM's tutorial) - add a debug printout def __init__(self, *args, **kwargs): self.in_madevent_mode = False # see MR #747 @@ -203,7 +208,6 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): cmdhistory is the list of command used so far. MG5options are all the options of the main interface outputflags is a list of options provided when doing the output command""" - misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self)) if self.in_madevent_mode: self.add_input_for_banner() if 'CUDACPP_CODEGEN_PATCHLEVEL' in os.environ: patchlevel = os.environ['CUDACPP_CODEGEN_PATCHLEVEL'] @@ -237,7 +241,8 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): raise Exception('ERROR! the O/S call to patchMad.sh failed') # Additional patching (OM) self.add_madevent_plugin_fct() # Added by OM - return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) + # do not call standard finalize since is this is already done... 
+ #return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) # AV (default from OM's tutorial) - overload settings and add a debug printout def modify_grouping(self, matrix_element): From 53e65d00b24b1484c4effa3b012f28dd11511160 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 13:36:19 +0100 Subject: [PATCH 03/28] [floating_type_interface] in CODEGEN/generateAndCompare.sh, anticipate the checks for errors in codegen log before manipulating the generated code --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 1f416ac5b9..7519fbcc7d 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -201,6 +201,16 @@ function codeGenAndDiff() cat ${outproc}_log.txt | egrep -v '(Crash Annotation)' > ${outproc}_log.txt.new # remove firefox 'glxtest: libEGL initialize failed' errors \mv ${outproc}_log.txt.new ${outproc}_log.txt fi + # Check the code generation log for errors + if [ -d ${outproc} ] && ! grep -q "Please report this bug" ${outproc}_log.txt; then + ###cat ${outproc}_log.txt; exit 0 # FOR DEBUGGING + cat ${MG5AMC_HOME}/${outproc}_log.txt | { egrep 'INFO: (Try|Creat|Organiz|Process)' || true; } + else + echo "*** ERROR! Code generation failed" + cat ${MG5AMC_HOME}/${outproc}_log.txt + echo "*** ERROR! Code generation failed" + exit 1 + fi # Patches moved here from patchMad.sh after Olivier's PR #764 (THIS IS ONLY NEEDED IN THE MADGRAPH4GPU GIT REPO) if [ "${OUTBCK}" == "mad" ]; then # Force the use of strategy SDE=1 in multichannel mode (see #419) @@ -244,16 +254,6 @@ function codeGenAndDiff() CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA" >> ${outproc}/Cards/run_card.dat fi fi - # Check the code generation log for errors - if [ -d ${outproc} ] && ! 
grep -q "Please report this bug" ${outproc}_log.txt; then - ###cat ${outproc}_log.txt; exit 0 # FOR DEBUGGING - cat ${MG5AMC_HOME}/${outproc}_log.txt | { egrep 'INFO: (Try|Creat|Organiz|Process)' || true; } - else - echo "*** ERROR! Code generation failed" - cat ${MG5AMC_HOME}/${outproc}_log.txt - echo "*** ERROR! Code generation failed" - exit 1 - fi popd >& /dev/null # Choose which directory must be copied (for gridpack generation: untar and modify the gridpack) if [ "${SCRBCK}" == "gridpack" ]; then From 4513bae1730e01cb3087075c59531b377f759bf8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 13:46:50 +0100 Subject: [PATCH 04/28] [floating_type_interface] in CODEGEN launch_plugin.py, minor fix (and some cleanup) in PR #788 --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 120d30304b..acc5826fdb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -29,8 +29,7 @@ def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - + self.options['nb_core'] = multiprocessing.cpu_count() if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' @@ -38,8 +37,6 @@ def compile(self, *args, **opts): {'FPTYPE': self.run_card['floating_type'], 'AVX': avx_type }) misc.sprint('FPTYPE checked') - - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building 
madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) @@ -55,7 +52,6 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) - # Phase-Space Optimization ------------------------------------------------------------------------------------ template_on = \ """#********************************************************************* @@ -67,12 +63,9 @@ def compile(self, *args, **opts): """ template_off = '' - plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) - class CPPRunCard(banner_mod.RunCardLO): - blocks = banner_mod.RunCardLO.blocks + [plugin_block] def reset_simd(self, old_value, new_value, name): @@ -87,12 +80,9 @@ def reset_simd(self, old_value, new_value, name): subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) def reset_makeopts(self, old_value, new_value, name): - if not hasattr(self, 'path'): raise Exception - avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' - if name == 'floating_type': common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) elif name == 'avx_type': @@ -101,7 +91,6 @@ def reset_makeopts(self, old_value, new_value, name): common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) else: raise Exception - Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) @@ -124,9 +113,6 @@ def default_setup(self): self.display_block.append('simd') self.display_block.append('psoptim') - ->>>>>>> 746f16476 (better formatting for the card from the start and adding the possiblitity to choose avx) - def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" if incname == "vector.inc" and 
'vector_size' not in self.user_set: @@ -143,14 +129,11 @@ def check_validity(self): self['hel_recycling'] = False class GPURunCard(CPPRunCard): - def default_setup(self): - super().default_setup() # change default value: self['cudacpp_backend'] = 'CUDA' self['vector_size'] = 16384 # already setup in default class (just change value) - MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From 484e35f05ba19a2e740bbe983ed0cfed3d479236 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 14:00:51 +0100 Subject: [PATCH 05/28] [floating_type_interface] in .github/workflows/testsuite_oneprocess.sh, add some debug printouts about comparison of generated code --- .github/workflows/testsuite_oneprocess.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh index 639991f1cb..ea3f294fc8 100755 --- a/.github/workflows/testsuite_oneprocess.sh +++ b/.github/workflows/testsuite_oneprocess.sh @@ -38,16 +38,20 @@ function codegen() { ###compare=true # enable comparison to current git repo compare=false # disable comparison to current git repo if [ ${compare} ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then + echo + echo "Compare newly generated code for ${proc} to that in the madgraph4gpu github repository" git checkout HEAD ${proc}/CODEGEN*.txt if [ "${proc%.mad}" != "${proc}" ]; then git checkout HEAD ${proc}/Cards/me5_configuration.txt ###sed -i 's/DEFAULT_F2PY_COMPILER=f2py.*/DEFAULT_F2PY_COMPILER=f2py3/' ${proc}/Source/make_opts git checkout HEAD ${proc}/Source/make_opts fi - echo echo "git diff (start)" git diff --exit-code echo "git diff (end)" + else + echo + echo "(SKIP comparison of newly generated code for ${proc} to that in the madgraph4gpu github repository)" fi } From fb22309b502c80a0a43755e81e018da572cbea0e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 14:13:15 +0100 Subject: [PATCH 06/28] [floating_type_interface] in 
.github/workflows/testsuite_allprocesses.yml, disable on:push triggers to avoid launching two jobs instead of one --- .github/workflows/testsuite_allprocesses.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/testsuite_allprocesses.yml b/.github/workflows/testsuite_allprocesses.yml index 7eaad09c9f..1d67a9efaf 100644 --- a/.github/workflows/testsuite_allprocesses.yml +++ b/.github/workflows/testsuite_allprocesses.yml @@ -19,8 +19,9 @@ on: branches: [ master ] # Trigger the all-processes workflow when new changes to the workflow are pushed - push: - paths: [ .github/workflows/testsuite* ] + # (NB: this is now disabled to avoid triggering two jobs when pushing to a branch for which a PR is opened) + ###push: + ### paths: [ .github/workflows/testsuite* ] #---------------------------------------------------------------------------------------------------------------------------------- From 437f9569686e43862be37c1020b5cb67ca83cb3e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 14:17:14 +0100 Subject: [PATCH 07/28] [floating_type_interface] in .github/workflows/testsuite_oneprocess.sh, fix a bash bug and disable comparisons to the existing repo --- .github/workflows/testsuite_oneprocess.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh index ea3f294fc8..d49d70c200 100755 --- a/.github/workflows/testsuite_oneprocess.sh +++ b/.github/workflows/testsuite_oneprocess.sh @@ -35,9 +35,9 @@ function codegen() { ./CODEGEN/generateAndCompare.sh -q ${proc%.sa} fi # Check if there are any differences to the current repo - ###compare=true # enable comparison to current git repo - compare=false # disable comparison to current git repo - if [ ${compare} ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then + ###compare=1 # enable comparison to current git repo + compare=0 # disable comparison to current 
git repo + if [ "${compare}" != "0" ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then echo echo "Compare newly generated code for ${proc} to that in the madgraph4gpu github repository" git checkout HEAD ${proc}/CODEGEN*.txt From 7f50f2f685802e388140d988d1d6677495e0c89d Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 10 Nov 2023 07:26:51 +0100 Subject: [PATCH 08/28] [floating_type_interface] revert Olivier's changes to MG5AMC before merging master git checkout c803581e57ed157a108c71e3887930bd3338c73d ../../MG5aMC git submodule update --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 8a18cc2423..49c93e01b8 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 8a18cc2423616ee91c4f9d74eec0cb2901e0fd2a +Subproject commit 49c93e01b8596cbdb4e65f628601de1e6f08c744 From 5b4652b3813e41d4471364bc176a9a2a78e0010e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 13:45:23 +0100 Subject: [PATCH 09/28] [floating_type_interface] regenerate gg_tt.mad, but this has a different vector.inc (where comments have disappeared) that I want to debug - will revert --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 27 ++++--- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 26 ++++++- .../gg_tt.mad/Cards/run_card_default.dat | 26 ++++++- epochX/cudacpp/gg_tt.mad/Source/vector.inc | 32 +-------- .../gg_tt.mad/bin/internal/launch_plugin.py | 71 +++++++++++++++---- 5 files changed, 122 insertions(+), 60 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a1fa47508f..900f4f489d 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 
0.005694150924682617  +DEBUG: model prefixing takes 0.005335807800292969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.099 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 2 routines in 0.144 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.129 s VVV1 FFV1 FFV1 @@ -219,7 +219,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -231,16 +230,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.772s -user 0m1.470s -sys 0m0.223s +real 0m1.670s +user 0m1.458s +sys 0m0.213s ************************************************************ * * * W E L C O M E to * @@ -268,8 +267,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP WARNING! CPPRunCard instance has no attribute path quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 80d07f317e..46d89bbc4f 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! 
control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -140,6 +156,14 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule +#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + auto = avx_type ! for SIMD, technology to use for the vectorization + CPP = cudacpp_backend ! 
Fortran/CPP/CUDA switch mode to use + + #********************************************************************* # Options for the cudacpp plugin #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 48d84c73f0..7697a3f769 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! 
fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -139,3 +155,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + auto = avx_type ! for SIMD, technology to use for the vectorization + CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use + diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 863eebbc70..9eab485b2b 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -1,31 +1,3 @@ -C -C If VECSIZE_MEMMAX is greater than 1, a vector API is used: -C this is designed for offloading MEs to GPUs or vectorized C++, -C but it can also be used for computing MEs in Fortran. -C If VECSIZE_MEMMAX equals 1, the old scalar API is used: -C this can only be used for computing MEs in Fortran. -C -C Fortran arrays in the vector API can hold up to VECSIZE_MEMMAX -C events and are statically allocated at compile time. -C The constant value of VECSIZE_MEMMAX is fixed at codegen time -C (output madevent ... --vector_size=). -C -C While the arrays can hold up to VECSIZE_MEMMAX events, -C only VECSIZE_USED (<= VECSIZE_MEMAMX) are used in Fortran loops. 
-C The value of VECSIZE_USED can be chosen at runtime -C (typically 8k-16k for GPUs, 16-32 for vectorized C++). -C -C The value of VECSIZE_USED represents the number of events -C handled by one call to the Fortran/cudacpp "bridge". -C This is not necessarily the number of events which are -C processed in lockstep within a single SIMD vector on CPUs -C or within a single "warp" of threads on GPUs. These parameters -C are internal to the cudacpp bridge and need not be exposed -C to the Fortran program which calls the cudacpp bridge. -C -C NB: THIS FILE CANNOT CONTAIN #ifdef DIRECTIVES -C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR -C (see https://github.com/madgraph5/madgraph4gpu/issues/458). -C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) + + PARAMETER (VECSIZE_MEMMAX=16) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0b849330ef..acc5826fdb 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 
'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_type }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,7 +52,22 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization + %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): logger.warning('WARNING! 
CPPRunCard instance has no attribute path') @@ -58,12 +79,39 @@ def reset_simd(self, old_value, new_value, name): Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_type': + if new_value == 'Auto': + new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_type', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" @@ -82,13 +130,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - 
super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From 7ed73c3272ccd5126d6a5468ea3956e4416fa919 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 13:46:06 +0100 Subject: [PATCH 10/28] [floating_type_interface] revert to the older gg_tt.mad, while I debug vector.inc Revert "[floating_type_interface] regenerate gg_tt.mad, but this has a different vector.inc (where comments have disappeared) that I want to debug - will revert" This reverts commit 5b4652b3813e41d4471364bc176a9a2a78e0010e --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 27 +++---- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 26 +------ .../gg_tt.mad/Cards/run_card_default.dat | 26 +------ epochX/cudacpp/gg_tt.mad/Source/vector.inc | 32 ++++++++- .../gg_tt.mad/bin/internal/launch_plugin.py | 71 ++++--------------- 5 files changed, 60 insertions(+), 122 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 900f4f489d..a1fa47508f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005335807800292969  +DEBUG: model prefixing takes 0.005694150924682617  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.099 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 2 routines in 0.145 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.129 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -219,6 +219,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -230,16 +231,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m1.670s -user 0m1.458s -sys 0m0.213s +real 0m4.772s +user 0m1.470s +sys 0m0.223s ************************************************************ * * * W E L C O M E to * @@ -267,6 +268,8 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run +run_card missed argument cudacpp_backend. Takes default: CPP +run_card missed argument cudacpp_backend. Takes default: CPP WARNING! CPPRunCard instance has no attribute path quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 46d89bbc4f..80d07f317e 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -80,23 +80,7 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -#********************************************************************* -# Phase-Space Optim (advanced) -#********************************************************************* - 0 = job_strategy ! see appendix of 1507.00020 (page 26) - 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference - -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact - -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. - 2 = survey_nchannel_per_job ! 
control how many Channel are integrated inside a single job on cluster/multicore - -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) -#********************************************************************* -# Compilation flag. -#********************************************************************* - -O = global_flag ! fortran optimization flag use for the all code. - --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' - -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) - +# To see advanced option for Phase-Space optimization: type "update psoptim" #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -156,14 +140,6 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - auto = avx_type ! for SIMD, technology to use for the vectorization - CPP = cudacpp_backend ! 
Fortran/CPP/CUDA switch mode to use - - #********************************************************************* # Options for the cudacpp plugin #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 7697a3f769..48d84c73f0 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -80,23 +80,7 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -#********************************************************************* -# Phase-Space Optim (advanced) -#********************************************************************* - 0 = job_strategy ! see appendix of 1507.00020 (page 26) - 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference - -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact - -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. - 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore - -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) -#********************************************************************* -# Compilation flag. -#********************************************************************* - -O = global_flag ! fortran optimization flag use for the all code. - --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' - -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16 = vector_size ! 
size designed for SIMD/OpenMP/GPU (number of events in lockstep) - +# To see advanced option for Phase-Space optimization: type "update psoptim" #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -155,11 +139,3 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule - -#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - auto = avx_type ! for SIMD, technology to use for the vectorization - CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use - diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 9eab485b2b..863eebbc70 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -1,3 +1,31 @@ +C +C If VECSIZE_MEMMAX is greater than 1, a vector API is used: +C this is designed for offloading MEs to GPUs or vectorized C++, +C but it can also be used for computing MEs in Fortran. +C If VECSIZE_MEMMAX equals 1, the old scalar API is used: +C this can only be used for computing MEs in Fortran. +C +C Fortran arrays in the vector API can hold up to VECSIZE_MEMMAX +C events and are statically allocated at compile time. +C The constant value of VECSIZE_MEMMAX is fixed at codegen time +C (output madevent ... --vector_size=). +C +C While the arrays can hold up to VECSIZE_MEMMAX events, +C only VECSIZE_USED (<= VECSIZE_MEMAMX) are used in Fortran loops. 
+C The value of VECSIZE_USED can be chosen at runtime +C (typically 8k-16k for GPUs, 16-32 for vectorized C++). +C +C The value of VECSIZE_USED represents the number of events +C handled by one call to the Fortran/cudacpp "bridge". +C This is not necessarily the number of events which are +C processed in lockstep within a single SIMD vector on CPUs +C or within a single "warp" of threads on GPUs. These parameters +C are internal to the cudacpp bridge and need not be exposed +C to the Fortran program which calls the cudacpp bridge. +C +C NB: THIS FILE CANNOT CONTAIN #ifdef DIRECTIVES +C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR +C (see https://github.com/madgraph5/madgraph4gpu/issues/458). +C INTEGER VECSIZE_MEMMAX - - PARAMETER (VECSIZE_MEMMAX=16) + PARAMETER (VECSIZE_MEMMAX=16384) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index acc5826fdb..0b849330ef 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -16,28 +16,22 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod - import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod - import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': - path = pjoin(opts['cwd'], 'make_opts') - avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' 
else '' - common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'], - 'AVX': avx_type }) - misc.sprint('FPTYPE checked') - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): + self.options['nb_core'] = multiprocessing.cpu_count() + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): + import pathlib + import os + pjoin = os.path.join cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -52,22 +46,7 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) -# Phase-Space Optimization ------------------------------------------------------------------------------------ -template_on = \ -"""#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization - %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use -""" - -template_off = '' -plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) - class CPPRunCard(banner_mod.RunCardLO): - blocks = banner_mod.RunCardLO.blocks + [plugin_block] - def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): logger.warning('WARNING! 
CPPRunCard instance has no attribute path') @@ -79,39 +58,12 @@ def reset_simd(self, old_value, new_value, name): Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - def reset_makeopts(self, old_value, new_value, name): - if not hasattr(self, 'path'): - raise Exception - avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' - if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) - elif name == 'avx_type': - if new_value == 'Auto': - new_value = '' - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) - else: - raise Exception - Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') - subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('floating_type', 'd', include=False, hidden=False, - fct_mod=(self.reset_makeopts,(),{}), - allowed=['m','d','f']) - self.add_param('avx_type', 'auto', include=False, hidden=False, - fct_mod=(self.reset_makeopts,(),{}), - allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) - self['vector_size'] = 16 # already setup in default class (just change value) - self['aloha_flag'] = '--fast-math' - self['matrix_flag'] = '-O3' - self.display_block.append('simd') - self.display_block.append('psoptim') + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" @@ -130,10 +82,13 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - 
super().default_setup() - # change default value: - self['cudacpp_backend'] = 'CUDA' - self['vector_size'] = 16384 # already setup in default class (just change value) + super(CPPRunCard, self).default_setup() + self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) + +#class CUDACPPRunCard(CPPRunCard): +# def default_setup(self): +# super(CPPRunCard, self).default_setup() +# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From 206d9efb66b15afda703e3b6f9d052a48e165a55 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 15:54:21 +0100 Subject: [PATCH 11/28] [floating_type_interface] in CODEGEN launch_plugin.py, ensure that the comments in vector.inc are kept when 'madevent treatcards' is called Note: vector.inc is initially created via export_v4.py, which is not available to treatcards. Then vector.inc is modified by banner.py, whose write_one_include_file is overloaded in launch_plugin.py. 
--- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index acc5826fdb..3527be0e2c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -113,11 +113,25 @@ def default_setup(self): self.display_block.append('simd') self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" From 64f2b176dbcff9d5859a6559b58d72ae5493f9dc Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 16:25:13 +0100 Subject: [PATCH 12/28] [floating_type_interface] regenerate gg_tt.mad, the only change in vector.inc is now that VECSIZE_MEMMAX is 16 instead of 16384 --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 27 +++--- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 26 
+++++- .../gg_tt.mad/Cards/run_card_default.dat | 26 +++++- epochX/cudacpp/gg_tt.mad/Source/vector.inc | 2 +- .../gg_tt.mad/bin/internal/launch_plugin.py | 91 +++++++++++++++---- 5 files changed, 138 insertions(+), 34 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a1fa47508f..bae9fd6d79 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005694150924682617  +DEBUG: model prefixing takes 0.005484580993652344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for 
process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 2 routines in 0.142 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.130 s VVV1 FFV1 FFV1 @@ -219,7 +219,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -231,16 +230,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.772s -user 0m1.470s -sys 0m0.223s +real 0m1.677s +user 0m1.472s +sys 0m0.204s ************************************************************ * * * W E L C O M E to * @@ -268,8 +267,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP WARNING! CPPRunCard instance has no attribute path quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 80d07f317e..46d89bbc4f 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 
2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -140,6 +156,14 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! 
see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule +#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + auto = avx_type ! for SIMD, technology to use for the vectorization + CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use + + #********************************************************************* # Options for the cudacpp plugin #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 48d84c73f0..7697a3f769 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -139,3 +155,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + auto = avx_type ! for SIMD, technology to use for the vectorization + CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use + diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 863eebbc70..3a54c8f26a 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -28,4 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). 
C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) + PARAMETER (VECSIZE_MEMMAX=16) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0b849330ef..3527be0e2c 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_type }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,7 +52,22 @@ def compile(self, *args, **opts): else: 
return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization + %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): logger.warning('WARNING! CPPRunCard instance has no attribute path') @@ -58,18 +79,59 @@ def reset_simd(self, old_value, new_value, name): Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_type': + if new_value == 'Auto': + new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def 
plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_type', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +144,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - 
self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From a96455b41018b1b70fec2cfafe906a7494645da5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 16:34:52 +0100 Subject: [PATCH 13/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, modify the runcard to make sure that VECSIZE_MEMMAX is 16384 instead of 16 --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 7519fbcc7d..ff02dcbb10 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -215,9 +215,12 @@ function codeGenAndDiff() if [ "${OUTBCK}" == "mad" ]; then # Force the use of strategy SDE=1 in multichannel mode (see #419) sed -i 's/2 = sde_strategy/1 = sde_strategy/' ${outproc}/Cards/run_card.dat + # Force the use of VECSIZE_MEMMAX=16384 + sed -i 's/16 = vector_size/16384 = vector_size/' ${outproc}/Cards/run_card.dat # Generate run_card.inc and param_card.inc (include stdout and stderr in the code generation log which is later checked for errors) # These two steps are part of "cd Source; make" but they actually are code-generating steps - ${outproc}/bin/madevent treatcards run >> ${outproc}_log.txt 2>&1 # AV BUG! 
THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") + # Note: treatcards run also regenerates vector.inc if vector_size has changed in the runcard + ${outproc}/bin/madevent treatcards run >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") ${outproc}/bin/madevent treatcards param >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") # Cleanup \rm -f ${outproc}/crossx.html From ad54786f8e772052f5ca1e95a1deaf27f1ca5cbb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 16:36:33 +0100 Subject: [PATCH 14/28] [floating_type_interface] regenerate gg_tt.mad, now vector.inc (and runcard.dat) use 16384 instead of 16 Note also that 'WARNING! CPPRunCard instance has no attribute path' has disappeared from the logs. This is because vector.inc is no longer regenerated? --- .../cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 15 +++++++-------- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 2 +- epochX/cudacpp/gg_tt.mad/Source/vector.inc | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index bae9fd6d79..31d174d109 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005484580993652344  +DEBUG: model prefixing takes 0.0054912567138671875  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,11 +191,11 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.100 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.142 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.677s -user 0m1.472s -sys 0m0.204s +real 0m1.674s +user 0m1.441s +sys 0m0.228s ************************************************************ * * * W E L C O M E to * @@ -267,7 +267,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 46d89bbc4f..9d601c6e36 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -95,7 +95,7 @@ -O = global_flag ! fortran optimization flag use for the all code. --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + 16384 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) #********************************************************************* # Customization (custom cuts/scale/bias/...) * diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 3a54c8f26a..863eebbc70 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -28,4 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16) + PARAMETER (VECSIZE_MEMMAX=16384) From b291e3bea0d09176ddfedbdaa0afe4c9b141e7c2 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:33:36 +0100 Subject: [PATCH 15/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, use --vector_size=32 to ensure that vector.inc is recreated when the runcard is modified to use 16384 Indeed, the warning now appears again "WARNING! 
CPPRunCard instance has no attribute path" when regenerating ggtt.mad --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index ff02dcbb10..b790ca5081 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -152,7 +152,7 @@ function codeGenAndDiff() echo -e "\n+++ Generate code for '$proc'\n" ###exit 0 # FOR DEBUGGING # Vector size for mad/madonly meexporter (VECSIZE_MEMMAX) - vecsize=16384 # NB THIS IS NO LONGER IGNORED (but will eventually be tunable via runcards) + vecsize=32 # NB THIS IS NO LONGER IGNORED (but will eventually be tunable via runcards) # Generate code for the specific process pushd $MG5AMC_HOME >& /dev/null mkdir -p ../TMPOUT From bc93a0fd6357b75a9e8a17192bd76490f060da5f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:36:11 +0100 Subject: [PATCH 16/28] [floating_type_interface] regenerate gg_tt.mad, the warning now appears again "WARNING! CPPRunCard instance has no attribute path" --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 27 ++++++++++--------- .../cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_tt.mad/mg5.in | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 31d174d109..75a4522df3 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054912567138671875  +DEBUG: model prefixing takes 0.0056798458099365234  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,7 +157,7 @@ INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams 1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,23 +184,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.106 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.153 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.130 s +ALOHA: aloha creates 4 routines in 0.139 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.674s -user 0m1.441s -sys 0m0.228s +real 0m1.874s +user 0m1.545s +sys 0m0.221s ************************************************************ * * * W E L C O M E to * @@ -267,6 +267,7 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run +WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 4c14989a3f..cf111e2e6d 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --ve\ -ctor_size=16384 --me_exporter=standalone_cudacpp +ctor_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_tt.mad/mg5.in b/epochX/cudacpp/gg_tt.mad/mg5.in index 7859bf9b80..b4b356fc51 100644 --- a/epochX/cudacpp/gg_tt.mad/mg5.in +++ b/epochX/cudacpp/gg_tt.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ -output madevent gg_tt.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_tt.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp From 9eb8b4820eaef37e180d485ea1d37a619b1aa631 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:38:48 +0100 Subject: [PATCH 17/28] [floating_type_interface] upgrade MG5aMC/mg5amcnlo from d8c1613cc to 7b8964e28 including Olivier's patch for warning #790 --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index d8c1613ccf..7b8964e28c 160000 --- a/MG5aMC/mg5amcnlo +++ 
b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit d8c1613ccf638b5b078a64379e385def5649622c +Subproject commit 7b8964e28c7ea658d11a5224485e7fd918d7a63a From ca4c1628e22806b39cb7ac948898e0e0449c8658 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:40:23 +0100 Subject: [PATCH 18/28] [floating_type_interface] regenerate gg_tt.mad, the warning now has disappered thanks to Olivier's fix for #790 --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 18 +++++++++--------- .../cudacpp/gg_tt.mad/bin/internal/banner.py | 4 +++- .../bin/internal/madevent_interface.py | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 75a4522df3..8377ba9a6a 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -2,6 +2,7 @@ This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode +('WARNING: loading of madgraph too slow!!!', 0.515019416809082) ************************************************************ * * * W E L C O M E to * @@ -62,7 +63,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0056798458099365234  +DEBUG: model prefixing takes 0.005373716354370117  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +176,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +192,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.106 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.153 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.139 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 @@ -237,9 +238,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.874s -user 0m1.545s -sys 0m0.221s +real 0m2.117s +user 0m1.839s +sys 0m0.239s ************************************************************ * * * W E L C O M E to * @@ -267,7 +268,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 3995ce8109..651175405f 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card From 8422d19748232930d634e95547c0bb6b28226ece Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:42:59 +0100 Subject: [PATCH 19/28] [floating_type_interface] in CODEGEN launch_plugin.py, remove the workaround for #790, now fixed in mg5amcnlo --- 
.../cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 3527be0e2c..797560fcce 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -70,9 +70,7 @@ class CPPRunCard(banner_mod.RunCardLO): def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return From 1045a420891c3035c3a78e7b6bee896f9caf3e17 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:44:05 +0100 Subject: [PATCH 20/28] [floating_type_interface] regenerate gg_tt.mad, there is no exception thrown after fixing #790 and removing the workaround --- .../cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 15 +++++++-------- .../gg_tt.mad/bin/internal/launch_plugin.py | 4 +--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 8377ba9a6a..4cb42aacca 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -2,7 +2,6 @@ This version is intended for development/beta testing and NOT for production. 
This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 0.515019416809082) ************************************************************ * * * W E L C O M E to * @@ -63,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005373716354370117  +DEBUG: model prefixing takes 0.005273580551147461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -196,12 +195,12 @@ Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.131 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -238,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.117s -user 0m1.839s -sys 0m0.239s +real 0m1.719s +user 0m1.469s +sys 0m0.213s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 3527be0e2c..797560fcce 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -70,9 +70,7 @@ class CPPRunCard(banner_mod.RunCardLO): def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return From 72f12a152f77df51fb07409639e21072cb6ac4de Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:54:50 +0100 Subject: [PATCH 21/28] [floating_type_interface] in CODEGEN launch_plugin.py, improve cudacpp runcard comments and indentation --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 797560fcce..cfec10d9d8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -54,12 +54,12 @@ def compile(self, *args, **opts): # Phase-Space Optimization ------------------------------------------------------------------------------------ template_on = \ -"""#********************************************************************* -# SIMD/GPU Parametrization 
-#********************************************************************* - %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization - %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_type)s = avx_type ! SIMD vectorization level: none, sse4, avx2, 512y, 512z + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA """ template_off = '' From 843442c280c161c5e4d65c87722714d5c7eb677c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:57:23 +0100 Subject: [PATCH 22/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, do not add cudacpp_backend at the end of runcards, as it is now already there by default --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index b790ca5081..99bf044767 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -251,10 +251,7 @@ function codeGenAndDiff() #********************************************************************* # Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) - -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! 
valid backends are FORTRAN, CPP, CUDA" >> ${outproc}/Cards/run_card.dat +-O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)" >> ${outproc}/Cards/run_card.dat fi fi popd >& /dev/null From 20364f5b16dff94fa24ea2e6dfc7b145cdd992a9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:11:28 +0100 Subject: [PATCH 23/28] [floating_type_interface] upgrade MG5aMC/mg5amcnlo to include my minor changes to runcard comments --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 7b8964e28c..23f61b93fd 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 7b8964e28c7ea658d11a5224485e7fd918d7a63a +Subproject commit 23f61b93fdf268a1cdcbd363cd449c88b3511d7a From e4eac91e8e10d2df390c159b82ec0d7a438cdfdf Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:12:12 +0100 Subject: [PATCH 24/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, modify the existing global_flag in runcards, instead of adding a new line at the end --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 99bf044767..91341dbc4f 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -217,6 +217,8 @@ function codeGenAndDiff() sed -i 's/2 = sde_strategy/1 = sde_strategy/' ${outproc}/Cards/run_card.dat # Force the use of VECSIZE_MEMMAX=16384 sed -i 's/16 = vector_size/16384 = vector_size/' ${outproc}/Cards/run_card.dat + # Force the use of fast-math in Fortran builds + sed -i 's/-O = global_flag.*/-O3 -ffast-math -fbounds-check = global_flag ! 
build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)/' ${outproc}/Cards/run_card.dat # Generate run_card.inc and param_card.inc (include stdout and stderr in the code generation log which is later checked for errors) # These two steps are part of "cd Source; make" but they actually are code-generating steps # Note: treatcards run also regenerates vector.inc if vector_size has changed in the runcard @@ -245,13 +247,6 @@ function codeGenAndDiff() cat ${outproc}/Source/make_opts | sed '/#end/q' | sort > ${outproc}/Source/make_opts.new cat ${outproc}/Source/make_opts | sed -n -e '/#end/,$p' >> ${outproc}/Source/make_opts.new \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts - echo " -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)" >> ${outproc}/Cards/run_card.dat fi fi popd >& /dev/null From 01a2bfa3011b8b6932d3bb65e8bc8aecd10bf5fd Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:28:25 +0100 Subject: [PATCH 25/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, minor fix to avoid duplicate '#end_of_make_opts_variables' in make_opts Also add a comment about GLOBAL_FLAG in make_opts (why is this not propagated from the runcards?) 
--- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 91341dbc4f..f3f830205c 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -238,13 +238,14 @@ function codeGenAndDiff() touch ${outproc}/HTML/.keep # new file if [ "${patchlevel}" != "0" ]; then # Add global flag '-O3 -ffast-math -fbounds-check' as in previous gridpacks + # (FIXME? these flags are already set in the runcards, why are they not propagated to make_opts?) echo "GLOBAL_FLAG=-O3 -ffast-math -fbounds-check" > ${outproc}/Source/make_opts.new cat ${outproc}/Source/make_opts >> ${outproc}/Source/make_opts.new \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts fi if [ "${patchlevel}" == "2" ]; then sed -i 's/DEFAULT_F2PY_COMPILER=f2py.*/DEFAULT_F2PY_COMPILER=f2py3/' ${outproc}/Source/make_opts - cat ${outproc}/Source/make_opts | sed '/#end/q' | sort > ${outproc}/Source/make_opts.new + cat ${outproc}/Source/make_opts | sed '/#end/q' | head --lines=-1 | sort > ${outproc}/Source/make_opts.new cat ${outproc}/Source/make_opts | sed -n -e '/#end/,$p' >> ${outproc}/Source/make_opts.new \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts fi From 5fc54dc840835485102467b2362e46f0aebef420 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:47:25 +0100 Subject: [PATCH 26/28] [floating_type_interface] in CODEGEN launch_plugin.py, rename avx_type as avx_level in the runcard (and fix a minor bug, replace Auto by auto) --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index cfec10d9d8..3b09713e12 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -32,10 +32,10 @@ def compile(self, *args, **opts): self.options['nb_core'] = multiprocessing.cpu_count() if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') - avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' common_run_interface.CommonRunCmd.update_make_opts_full(path, {'FPTYPE': self.run_card['floating_type'], - 'AVX': avx_type }) + 'AVX': avx_level }) misc.sprint('FPTYPE checked') if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py @@ -58,7 +58,7 @@ def compile(self, *args, **opts): # SIMD/GPU configuration for the CUDACPP plugin #************************************************************************ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) - %(avx_type)s = avx_type ! SIMD vectorization level: none, sse4, avx2, 512y, 512z + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto %(cudacpp_backend)s = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA """ @@ -80,12 +80,11 @@ def reset_simd(self, old_value, new_value, name): def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception - avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' if name == 'floating_type': common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) - elif name == 'avx_type': - if new_value == 'Auto': - new_value = '' + elif name == 'avx_level': + if new_value == 'auto': new_value = '' common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) else: raise Exception @@ -100,7 +99,7 @@ def default_setup(self): self.add_param('floating_type', 'd', include=False, hidden=False, fct_mod=(self.reset_makeopts,(),{}), allowed=['m','d','f']) - self.add_param('avx_type', 'auto', include=False, hidden=False, + self.add_param('avx_level', 'auto', include=False, hidden=False, fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, From 795ae3752e88f8d1307d2e1f7d7ed25ee445225e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:48:51 +0100 Subject: [PATCH 27/28] [floating_type_interface] regenerate gg_tt.mad, all looks ok (only make_opts changes but only a comment is removed in there) --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 14 +++++----- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 26 ++++++------------- .../gg_tt.mad/Cards/run_card_default.dat | 14 +++++----- epochX/cudacpp/gg_tt.mad/Source/make_opts | 1 - .../cudacpp/gg_tt.mad/bin/internal/banner.py | 2 +- .../gg_tt.mad/bin/internal/launch_plugin.py | 25 +++++++++--------- 6 files changed, 35 insertions(+), 47 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt 
b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 4cb42aacca..9082dbc054 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005273580551147461  +DEBUG: model prefixing takes 0.00536799430847168  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,12 +195,12 @@ Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.130 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.719s -user 0m1.469s -sys 0m0.213s +real 0m1.673s +user 0m1.458s +sys 0m0.207s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 9d601c6e36..a6463dc262 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -92,10 +92,10 @@ #********************************************************************* # Compilation flag. #********************************************************************* - -O = global_flag ! fortran optimization flag use for the all code. + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16384 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) #********************************************************************* # Customization (custom cuts/scale/bias/...) * @@ -156,20 +156,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - auto = avx_type ! for SIMD, technology to use for the vectorization - CPP = cudacpp_backend ! 
Fortran/CPP/CUDA switch mode to use - - -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 7697a3f769..27e990a016 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -95,7 +95,7 @@ -O = global_flag ! fortran optimization flag use for the all code. --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) #********************************************************************* # Customization (custom cuts/scale/bias/...) * @@ -156,10 +156,10 @@ systematics = systematics_program ! 
none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - auto = avx_type ! for SIMD, technology to use for the vectorization - CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 651175405f..bd1517985f 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -3915,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! 
fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 797560fcce..3b09713e12 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -32,10 +32,10 @@ def compile(self, *args, **opts): self.options['nb_core'] = multiprocessing.cpu_count() if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') - avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' common_run_interface.CommonRunCmd.update_make_opts_full(path, {'FPTYPE': self.run_card['floating_type'], - 'AVX': avx_type }) + 'AVX': avx_level }) misc.sprint('FPTYPE checked') if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py @@ -54,12 +54,12 @@ def compile(self, *args, **opts): # Phase-Space Optimization ------------------------------------------------------------------------------------ template_on = \ -"""#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - %(floating_type)s = floating_type ! 
single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization - %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA """ template_off = '' @@ -80,12 +80,11 @@ def reset_simd(self, old_value, new_value, name): def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception - avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' if name == 'floating_type': common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) - elif name == 'avx_type': - if new_value == 'Auto': - new_value = '' + elif name == 'avx_level': + if new_value == 'auto': new_value = '' common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) else: raise Exception @@ -100,7 +99,7 @@ def default_setup(self): self.add_param('floating_type', 'd', include=False, hidden=False, fct_mod=(self.reset_makeopts,(),{}), allowed=['m','d','f']) - self.add_param('avx_type', 'auto', include=False, hidden=False, + self.add_param('avx_level', 'auto', include=False, hidden=False, fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, From 4c912508e57892729d2a57ddb95a2b78b37447ed 
Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:54:52 +0100 Subject: [PATCH 28/28] [floating_type_interface] ** COMPLETE floating_type_interface ** regenerate all 15 processes No need to rerun the tests, there are no real changes in the code with respect to upstream/master --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 39 ++-- .../ee_mumu.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat | 32 ++- .../ee_mumu.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/ee_mumu.mad/Source/make_opts | 1 - .../ee_mumu.mad/bin/internal/banner.py | 6 +- .../ee_mumu.mad/bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/ee_mumu.mad/mg5.in | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 29 ++- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 25 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 48 ++-- .../gg_tt01g.mad/Cards/proc_card_mg5.dat | 2 +- .../cudacpp/gg_tt01g.mad/Cards/run_card.dat | 32 ++- .../gg_tt01g.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gg_tt01g.mad/Source/make_opts | 1 - .../gg_tt01g.mad/bin/internal/banner.py | 6 +- .../bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gg_tt01g.mad/mg5.in | 2 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 40 ++-- .../gg_ttg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat | 32 ++- .../gg_ttg.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gg_ttg.mad/Source/make_opts | 1 - .../cudacpp/gg_ttg.mad/bin/internal/banner.py | 6 +- .../gg_ttg.mad/bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gg_ttg.mad/mg5.in | 2 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 29 ++- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 40 ++-- .../gg_ttgg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat | 32 
++- .../gg_ttgg.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gg_ttgg.mad/Source/make_opts | 1 - .../gg_ttgg.mad/bin/internal/banner.py | 6 +- .../gg_ttgg.mad/bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gg_ttgg.mad/mg5.in | 2 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 25 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 44 ++-- .../gg_ttggg.mad/Cards/proc_card_mg5.dat | 2 +- .../cudacpp/gg_ttggg.mad/Cards/run_card.dat | 32 ++- .../gg_ttggg.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gg_ttggg.mad/Source/make_opts | 1 - .../gg_ttggg.mad/bin/internal/banner.py | 6 +- .../bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gg_ttggg.mad/mg5.in | 2 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 29 ++- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 46 ++-- .../gq_ttq.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat | 32 ++- .../gq_ttq.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gq_ttq.mad/Source/make_opts | 1 - .../cudacpp/gq_ttq.mad/bin/internal/banner.py | 6 +- .../gq_ttq.mad/bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gq_ttq.mad/mg5.in | 2 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 35 ++- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 23 +- .../CODEGEN_mad_pp_tt012j_log.txt | 216 +++++++++--------- .../pp_tt012j.mad/Cards/proc_card_mg5.dat | 2 +- .../cudacpp/pp_tt012j.mad/Cards/run_card.dat | 32 ++- .../pp_tt012j.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/pp_tt012j.mad/Source/make_opts | 1 - .../pp_tt012j.mad/bin/internal/banner.py | 6 +- .../bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/pp_tt012j.mad/mg5.in | 2 +- 71 files changed, 1234 insertions(+), 605 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt 
b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index e6546f684c..e090137829 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005372047424316406  +DEBUG: model prefixing takes 0.005680561065673828  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -156,15 +156,15 @@ INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams 1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams -output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu WARNING: File 
exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -183,27 +183,27 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  WARNING: vector code for lepton pdf not implemented. 
We removed the option to run dressed lepton  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.098 s +Wrote files for 8 helas calls in 0.097 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.200 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 3 routines in 0.199 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.255 s +ALOHA: aloha creates 7 routines in 0.254 s FFV1 FFV1 FFV2 @@ -226,7 +226,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -242,16 +241,16 @@ patching file matrix1.f Hunk #3 succeeded at 230 (offset 9 lines). 
Hunk #4 succeeded at 267 (offset 18 lines). Hunk #5 succeeded at 312 (offset 18 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m4.853s -user 0m1.653s -sys 0m0.201s +real 0m1.848s +user 0m1.621s +sys 0m0.225s ************************************************************ * * * W E L C O M E to * @@ -279,10 +278,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index 618adbca06..22e76563ab 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --\ -vector_size=16384 --me_exporter=standalone_cudacpp +vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat index 86f6a33258..1084532333 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat @@ -85,7 +85,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -182,12 +198,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat index 67c1a4de28..0fa0402261 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat @@ -85,7 +85,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -181,3 +197,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include 
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/ee_mumu.mad/mg5.in b/epochX/cudacpp/ee_mumu.mad/mg5.in index 12a2c58512..4e83015b40 100644 --- a/epochX/cudacpp/ee_mumu.mad/mg5.in +++ b/epochX/cudacpp/ee_mumu.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate e+ e- > mu+ mu- -output madevent ee_mumu.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ee_mumu.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 8cb80f0d38..3447ea61e3 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005633831024169922  +DEBUG: model prefixing takes 0.0055027008056640625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,34 +154,34 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.270 s FFV1 FFV1 FFV2 @@ -198,9 +198,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.653s -user 0m0.601s -sys 0m0.049s +real 0m0.657s +user 0m0.602s +sys 0m0.050s diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 9082dbc054..3326a8488f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00536799430847168  +DEBUG: model prefixing takes 0.005394458770751953  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.100 s +Wrote files for 10 helas calls in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.142 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.130 s +ALOHA: aloha creates 4 routines in 0.138 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.673s -user 0m1.458s -sys 0m0.207s +real 0m1.677s +user 0m1.453s +sys 0m0.213s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 805df19bd9..f09b4fa669 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00567626953125  +DEBUG: model prefixing takes 0.00550079345703125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,26 +161,26 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files 
CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.146 s VVV1 FFV1 FFV1 @@ -193,9 +193,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.529s -user 0m0.478s -sys 0m0.048s +real 0m0.583s +user 0m0.466s +sys 0m0.061s diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 9d4dbd85f0..acacaf4036 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005400419235229492  +DEBUG: model prefixing takes 0.0055658817291259766  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,15 +165,15 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams 1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,15 +194,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -211,29 +211,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s -Wrote files for 46 helas calls in 0.242 s +Wrote files for 46 helas calls in 0.245 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.324 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.321 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines 
with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.308 s +ALOHA: aloha creates 10 routines in 0.304 s VVV1 VVV1 FFV1 @@ -257,7 +257,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -277,16 +276,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.282s -user 0m2.049s -sys 0m0.227s +real 0m2.697s +user 0m2.018s +sys 0m0.247s ************************************************************ * * * W E L C O M E to * @@ -314,9 +313,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. 
Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index d0845f65f5..06ea2195d8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -47,4 +47,4 @@ define vl = ve vm vt define vl~ = ve~ vm~ vt~ add process g g > t t~ g output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False -\ --vector_size=16384 --me_exporter=standalone_cudacpp +-vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat index 3360456835..f7c6502eec 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -161,12 +177,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat index 46378139c8..0e45d1a8c9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat @@ -98,7 +98,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -160,3 +176,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['FORTRAN', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_tt01g.mad/mg5.in b/epochX/cudacpp/gg_tt01g.mad/mg5.in index a20e166e81..95984fcf10 100644 --- a/epochX/cudacpp/gg_tt01g.mad/mg5.in +++ b/epochX/cudacpp/gg_tt01g.mad/mg5.in @@ -2,4 +2,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ add process g g > t t~ g -output madevent gg_tt01g.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_tt01g.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 68afa8d9b0..b52dc31122 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005378007888793945  +DEBUG: model prefixing takes 0.0053288936614990234  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,15 +157,15 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams 1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.148 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.155 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 
0.326 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.309 s +ALOHA: aloha creates 10 routines in 0.327 s VVV1 VVV1 FFV1 @@ -230,7 +230,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -246,16 +245,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m5.147s -user 0m1.924s -sys 0m0.225s +real 0m2.189s +user 0m1.943s +sys 0m0.223s ************************************************************ * * * W E L C O M E to * @@ -283,9 +282,6 @@ Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index a0ffbbc219..9d09090869 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --v\ -ector_size=16384 --me_exporter=standalone_cudacpp +ector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat index a9fbb5a212..f119b5a1c7 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! 
for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -143,12 +159,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! 
floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat index 1464f610ba..e89c78702d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! 
fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -142,3 +158,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include 
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttg.mad/mg5.in b/epochX/cudacpp/gg_ttg.mad/mg5.in index 98f53ce50d..e37d10d865 100644 --- a/epochX/cudacpp/gg_ttg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g -output madevent gg_ttg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 97056958fe..b9716683be 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005817890167236328  +DEBUG: model prefixing takes 0.005490303039550781  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.326 s +ALOHA: aloha creates 5 routines in 0.322 s VVV1 VVV1 FFV1 @@ -201,9 +201,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.779s -user 0m0.713s -sys 0m0.062s +real 0m0.784s +user 0m0.699s +sys 0m0.063s diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index eacd7a356a..ee3d38dfb1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053293704986572266  +DEBUG: model prefixing takes 0.00561833381652832  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.157 s +1 processes with 123 diagrams generated in 0.154 s Total: 1 processes with 123 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s -Wrote files for 222 helas calls in 0.691 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +Wrote files for 222 helas calls in 0.679 s ALOHA: aloha starts to compute helicity amplitudes 
ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.333 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.323 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.314 s VVV1 VVV1 FFV1 @@ -233,7 +233,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -249,15 +248,15 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m6.262s -user 0m3.028s +real 0m3.226s +user 0m2.976s sys 0m0.232s ************************************************************ * * @@ -286,9 +285,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index b7568d1a73..ea9cfcde68 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --\ -vector_size=16384 --me_exporter=standalone_cudacpp +vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat index 88d4377d71..42728a48f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 
2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -168,12 +184,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! 
see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat index 26fa7f39dd..0374d7ba60 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! 
limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -167,3 +183,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['FORTRAN', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttgg.mad/mg5.in b/epochX/cudacpp/gg_ttgg.mad/mg5.in index e2c5858b63..53784bf161 100644 --- a/epochX/cudacpp/gg_ttgg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttgg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g g -output madevent gg_ttgg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttgg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 80631c94bf..d62aabc436 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00567317008972168  +DEBUG: model prefixing takes 0.005473136901855469  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,29 +161,29 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc 
INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.318 s +ALOHA: aloha creates 5 routines in 0.894 s VVV1 VVV1 FFV1 @@ -204,9 +204,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m4.435s -user 0m1.373s +real 0m2.023s +user 0m1.361s sys 0m0.056s diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index ab3974344c..d94e7252af 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005319833755493164  +DEBUG: model prefixing takes 0.005348920822143555  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.855 s +1 processes with 1240 diagrams generated in 1.857 s Total: 1 processes with 1240 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -175,9 +175,9 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 7s. 
Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -186,29 +186,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 
313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 
672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at 
line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.574 s -Wrote files for 2281 helas calls in 18.431 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.607 s +Wrote files for 2281 helas calls in 18.169 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.335 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.317 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 
routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -235,7 +235,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -251,16 +250,16 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m32.103s -user 0m28.586s -sys 0m0.412s +real 0m28.894s +user 0m28.357s +sys 0m0.382s ************************************************************ * * * W E L C O M E to * @@ -288,9 +287,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 2f92ecc4ba..3923568dd8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False -\ --vector_size=16384 --me_exporter=standalone_cudacpp +-vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat index 0a6bf20eb9..8170176a11 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -174,12 +190,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat index 3714e71997..8da9ac1563 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -173,3 +189,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['FORTRAN', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttggg.mad/mg5.in b/epochX/cudacpp/gg_ttggg.mad/mg5.in index cdbc845cdd..f92d17d219 100644 --- a/epochX/cudacpp/gg_ttggg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttggg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g g g -output madevent gg_ttggg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttggg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 33bae20142..8660fec52a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005532503128051758  +DEBUG: model prefixing takes 0.005609273910522461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.880 s +1 processes with 1240 diagrams generated in 1.857 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.540 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.536 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.351 s +ALOHA: aloha creates 5 routines in 0.341 s VVV1 VVV1 FFV1 @@ -204,9 +204,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m15.959s -user 0m12.810s -sys 0m0.102s +real 0m12.866s +user 0m12.717s +sys 0m0.100s diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 89cb2749b0..97f5e25170 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057373046875  +DEBUG: model prefixing takes 0.005373477935791016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,17 +170,17 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -207,15 +207,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -224,19 +224,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.217 s +Wrote files for 32 helas calls in 0.219 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines ALOHA: aloha creates 2 routines in 0.144 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines @@ -260,7 +260,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', 
self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -288,16 +287,16 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m4.915s -user 0m1.680s -sys 0m0.237s +real 0m1.929s +user 0m1.701s +sys 0m0.227s ************************************************************ * * * W E L C O M E to * @@ -325,9 +324,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index efb0752a31..c49c39c3c4 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -48,4 +48,4 @@ define vl~ = ve~ vm~ vt~ define q = u c d s u~ c~ d~ s~ generate g q > t t~ q output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --v\ -ector_size=16384 --me_exporter=standalone_cudacpp +ector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat index 69e6fa2bf2..d987e643f0 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat @@ -82,7 +82,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -145,12 +161,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat index 4c99f39248..ef9e4b2707 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat @@ -82,7 +82,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -144,3 +160,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortran', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gq_ttq.mad/mg5.in b/epochX/cudacpp/gq_ttq.mad/mg5.in index e93843b8cd..2273ae9cfd 100644 --- a/epochX/cudacpp/gq_ttq.mad/mg5.in +++ b/epochX/cudacpp/gq_ttq.mad/mg5.in @@ -2,4 +2,4 @@ set stdout_level DEBUG set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ generate g q > t t~ q -output madevent gq_ttq.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gq_ttq.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 16374bd28e..91509797eb 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005791902542114258  +DEBUG: model prefixing takes 0.005265951156616211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,14 +170,14 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 @@ -190,28 +190,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=1 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=1 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.141 s FFV1 FFV1 FFV1 @@ -225,9 +225,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.656s -user 0m0.594s -sys 0m0.059s +real 0m0.640s +user 0m0.574s +sys 0m0.061s diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 3b04fc3fb3..452dbff73e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -135,25 +135,25 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_h Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering 
PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. 
Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.060 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -163,9 +163,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.422s -user 0m0.371s -sys 0m0.048s +real 0m0.415s +user 0m0.358s +sys 0m0.050s diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 8b6ca99446..e73dd42300 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00538325309753418  +DEBUG: model prefixing takes 0.0053882598876953125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.029 s +5 processes with 7 diagrams generated in 0.031 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.134 s +13 processes with 76 diagrams generated in 0.141 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,17 +378,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.811 s +65 processes with 1119 diagrams generated in 1.922 s Total: 83 processes with 1202 diagrams -output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -506,15 +506,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -523,15 +523,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -540,15 +540,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  2 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -557,15 +557,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  3 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -574,15 +574,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  4 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -591,15 +591,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  5 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -608,15 +608,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  6 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -625,15 +625,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  7 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -642,15 +642,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  8 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -659,15 +659,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  9 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -676,15 +676,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  10 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -693,15 +693,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  11 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -710,15 +710,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  12 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -727,15 +727,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  13 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -744,15 +744,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  14 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -761,15 +761,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  15 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -778,15 +778,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  16 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -795,29 +795,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1] [export_cpp.py at line 711]  DEBUG: subproc_number =  17 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.267 s -Wrote files for 810 helas calls in 3.215 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.279 s +Wrote files for 810 helas calls in 3.193 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.333 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.331 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha 
creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.312 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -844,7 +844,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -1022,16 +1021,16 @@ Hunk #2 succeeded at 194 (offset 51 lines). Hunk #3 succeeded at 272 (offset 51 lines). Hunk #4 succeeded at 300 (offset 51 lines). Hunk #5 succeeded at 345 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m11.764s -user 0m8.242s -sys 0m0.480s +real 0m8.865s +user 0m8.358s +sys 0m0.477s ************************************************************ * * * W E L C O M E to * @@ -1059,9 +1058,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index c0b1a2fd98..22837639b6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -50,4 +50,4 @@ generate p p > t t~ @0 add process p p > t t~ j @1 add process p p > t t~ j j @2 output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False \ ---vector_size=16384 --me_exporter=standalone_cudacpp +--vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat index f4ca7c0b5d..8b1007a3e5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -186,12 +202,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat index 58e12c11d7..2f72b65255 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -185,3 +201,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include 
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/pp_tt012j.mad/mg5.in b/epochX/cudacpp/pp_tt012j.mad/mg5.in index 91e22f5295..6bc40e7968 100644 --- a/epochX/cudacpp/pp_tt012j.mad/mg5.in +++ b/epochX/cudacpp/pp_tt012j.mad/mg5.in @@ -4,4 +4,4 @@ define j = p generate p p > t t~ @0 add process p p > t t~ j @1 add process p p > t t~ j j @2 -output madevent pp_tt012j.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent pp_tt012j.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp