From 95e9fd05d8ab8baa920760aaca93776430a71ad8 Mon Sep 17 00:00:00 2001 From: Olivier Mattelaer Date: Fri, 3 Nov 2023 08:24:09 +0100 Subject: [PATCH 01/28] add the option to choose the floating_point interface from the run_card (via make_opts) --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index c9d1c7706a..56c7ebaea4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -16,11 +16,13 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): @@ -32,6 +34,11 @@ def compile(self, *args, **opts): import pathlib import os pjoin = os.path.join + if 'cwd' in opts: + path = pjoin(opts['cwd'], os.pardir, os.pardir, 'Source', 'make_opts') + common_run_interface.CommonRunCmd.update_make_opts_full( + path, + {'FPTYPE': self.run_card['floating_type']}) cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -57,13 +64,29 @@ def reset_simd(self, old_value, new_value, name): return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], 
cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - + + def reset_makeopts(self, old_value, new_value, name): + + misc.sprint("PASS IN RESET MAKEOPTS", old_value, new_value, name) + if not hasattr(self, 'path'): + raise Exception + + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + else: + raise Exception + + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{})) def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" @@ -79,16 +102,13 @@ def check_validity(self): self['sde_strategy'] = 1 if self['hel_recycling']: self['hel_recycling'] = False - + class GPURunCard(CPPRunCard): def default_setup(self): super(CPPRunCard, self).default_setup() self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False) -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From cf822a089cc65914ce2f475a7ba5077d8f2a5bb1 Mon Sep 17 00:00:00 2001 From: Olivier Mattelaer Date: Mon, 6 Nov 2023 13:54:02 +0100 Subject: [PATCH 02/28] better formatting for the card from the start and adding the possiblitity to choose avx --- MG5aMC/mg5amcnlo | 2 +- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 74 +++++++++++++++---- .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 9 ++- 3 files changed, 66 insertions(+), 19 deletions(-) diff 
--git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 49c93e01b8..8a18cc2423 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 49c93e01b8596cbdb4e65f628601de1e6f08c744 +Subproject commit 8a18cc2423616ee91c4f9d74eec0cb2901e0fd2a diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 56c7ebaea4..120d30304b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -30,15 +30,17 @@ def compile(self, *args, **opts): import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join - if 'cwd' in opts: - path = pjoin(opts['cwd'], os.pardir, os.pardir, 'Source', 'make_opts') - common_run_interface.CommonRunCmd.update_make_opts_full( - path, - {'FPTYPE': self.run_card['floating_type']}) + + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_type }) + misc.sprint('FPTYPE checked') + + + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -53,7 +55,26 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) + +# Phase-Space Optimization 
------------------------------------------------------------------------------------ +template_on = \ +"""#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization + %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +""" + +template_off = '' + +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + + class CPPRunCard(banner_mod.RunCardLO): + + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): logger.warning('WARNING! CPPRunCard instance has no attribute path') @@ -67,12 +88,17 @@ def reset_simd(self, old_value, new_value, name): def reset_makeopts(self, old_value, new_value, name): - misc.sprint("PASS IN RESET MAKEOPTS", old_value, new_value, name) if not hasattr(self, 'path'): raise Exception + avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value}) + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_type': + if new_value == 'Auto': + new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) else: raise Exception @@ -84,9 +110,22 @@ def plugin_input(self, finput): def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) self.add_param('floating_type', 'd', include=False, hidden=False, - fct_mod=(self.reset_makeopts,(),{})) + fct_mod=(self.reset_makeopts,(),{}), + 
allowed=['m','d','f']) + self.add_param('avx_type', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + + +>>>>>>> 746f16476 (better formatting for the card from the start and adding the possiblitity to choose avx) def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" @@ -104,10 +143,13 @@ def check_validity(self): self['hel_recycling'] = False class GPURunCard(CPPRunCard): + def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - self.add_param('floating_type', 'd', include=False, hidden=False) + + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 8961036fb1..80d0c7e8bc 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -149,6 +149,11 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): ###helas_exporter = None helas_exporter = model_handling.PLUGIN_GPUFOHelasCallWriter # this is one of the main fixes for issue #341! + # Default class for the run_card to use + from . 
import launch_plugin + run_card_class = launch_plugin.CPPRunCard + + # AV (default from OM's tutorial) - add a debug printout def __init__(self, *args, **kwargs): self.in_madevent_mode = False # see MR #747 @@ -203,7 +208,6 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): cmdhistory is the list of command used so far. MG5options are all the options of the main interface outputflags is a list of options provided when doing the output command""" - misc.sprint('Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self)) if self.in_madevent_mode: self.add_input_for_banner() if 'CUDACPP_CODEGEN_PATCHLEVEL' in os.environ: patchlevel = os.environ['CUDACPP_CODEGEN_PATCHLEVEL'] @@ -237,7 +241,8 @@ def finalize(self, matrix_element, cmdhistory, MG5options, outputflag): raise Exception('ERROR! the O/S call to patchMad.sh failed') # Additional patching (OM) self.add_madevent_plugin_fct() # Added by OM - return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) + # do not call standard finalize since is this is already done... 
+ #return super().finalize(matrix_element, cmdhistory, MG5options, outputflag) # AV (default from OM's tutorial) - overload settings and add a debug printout def modify_grouping(self, matrix_element): From 53e65d00b24b1484c4effa3b012f28dd11511160 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 13:36:19 +0100 Subject: [PATCH 03/28] [floating_type_interface] in CODEGEN/generateAndCompare.sh, anticipate the checks for errors in codegen log before manipulating the generated code --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 1f416ac5b9..7519fbcc7d 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -201,6 +201,16 @@ function codeGenAndDiff() cat ${outproc}_log.txt | egrep -v '(Crash Annotation)' > ${outproc}_log.txt.new # remove firefox 'glxtest: libEGL initialize failed' errors \mv ${outproc}_log.txt.new ${outproc}_log.txt fi + # Check the code generation log for errors + if [ -d ${outproc} ] && ! grep -q "Please report this bug" ${outproc}_log.txt; then + ###cat ${outproc}_log.txt; exit 0 # FOR DEBUGGING + cat ${MG5AMC_HOME}/${outproc}_log.txt | { egrep 'INFO: (Try|Creat|Organiz|Process)' || true; } + else + echo "*** ERROR! Code generation failed" + cat ${MG5AMC_HOME}/${outproc}_log.txt + echo "*** ERROR! Code generation failed" + exit 1 + fi # Patches moved here from patchMad.sh after Olivier's PR #764 (THIS IS ONLY NEEDED IN THE MADGRAPH4GPU GIT REPO) if [ "${OUTBCK}" == "mad" ]; then # Force the use of strategy SDE=1 in multichannel mode (see #419) @@ -244,16 +254,6 @@ function codeGenAndDiff() CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA" >> ${outproc}/Cards/run_card.dat fi fi - # Check the code generation log for errors - if [ -d ${outproc} ] && ! 
grep -q "Please report this bug" ${outproc}_log.txt; then - ###cat ${outproc}_log.txt; exit 0 # FOR DEBUGGING - cat ${MG5AMC_HOME}/${outproc}_log.txt | { egrep 'INFO: (Try|Creat|Organiz|Process)' || true; } - else - echo "*** ERROR! Code generation failed" - cat ${MG5AMC_HOME}/${outproc}_log.txt - echo "*** ERROR! Code generation failed" - exit 1 - fi popd >& /dev/null # Choose which directory must be copied (for gridpack generation: untar and modify the gridpack) if [ "${SCRBCK}" == "gridpack" ]; then From 4513bae1730e01cb3087075c59531b377f759bf8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 13:46:50 +0100 Subject: [PATCH 04/28] [floating_type_interface] in CODEGEN launch_plugin.py, minor fix (and some cleanup) in PR #788 --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 120d30304b..acc5826fdb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -29,8 +29,7 @@ def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - + self.options['nb_core'] = multiprocessing.cpu_count() if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' @@ -38,8 +37,6 @@ def compile(self, *args, **opts): {'FPTYPE': self.run_card['floating_type'], 'AVX': avx_type }) misc.sprint('FPTYPE checked') - - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building 
madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) @@ -55,7 +52,6 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) - # Phase-Space Optimization ------------------------------------------------------------------------------------ template_on = \ """#********************************************************************* @@ -67,12 +63,9 @@ def compile(self, *args, **opts): """ template_off = '' - plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) - class CPPRunCard(banner_mod.RunCardLO): - blocks = banner_mod.RunCardLO.blocks + [plugin_block] def reset_simd(self, old_value, new_value, name): @@ -87,12 +80,9 @@ def reset_simd(self, old_value, new_value, name): subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) def reset_makeopts(self, old_value, new_value, name): - if not hasattr(self, 'path'): raise Exception - avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' - if name == 'floating_type': common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) elif name == 'avx_type': @@ -101,7 +91,6 @@ def reset_makeopts(self, old_value, new_value, name): common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) else: raise Exception - Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) @@ -124,9 +113,6 @@ def default_setup(self): self.display_block.append('simd') self.display_block.append('psoptim') - ->>>>>>> 746f16476 (better formatting for the card from the start and adding the possiblitity to choose avx) - def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" if incname == "vector.inc" and 
'vector_size' not in self.user_set: @@ -143,14 +129,11 @@ def check_validity(self): self['hel_recycling'] = False class GPURunCard(CPPRunCard): - def default_setup(self): - super().default_setup() # change default value: self['cudacpp_backend'] = 'CUDA' self['vector_size'] = 16384 # already setup in default class (just change value) - MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From 484e35f05ba19a2e740bbe983ed0cfed3d479236 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 14:00:51 +0100 Subject: [PATCH 05/28] [floating_type_interface] in .github/workflows/testsuite_oneprocess.sh, add some debug printouts about comparison of generated code --- .github/workflows/testsuite_oneprocess.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh index 639991f1cb..ea3f294fc8 100755 --- a/.github/workflows/testsuite_oneprocess.sh +++ b/.github/workflows/testsuite_oneprocess.sh @@ -38,16 +38,20 @@ function codegen() { ###compare=true # enable comparison to current git repo compare=false # disable comparison to current git repo if [ ${compare} ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then + echo + echo "Compare newly generated code for ${proc} to that in the madgraph4gpu github repository" git checkout HEAD ${proc}/CODEGEN*.txt if [ "${proc%.mad}" != "${proc}" ]; then git checkout HEAD ${proc}/Cards/me5_configuration.txt ###sed -i 's/DEFAULT_F2PY_COMPILER=f2py.*/DEFAULT_F2PY_COMPILER=f2py3/' ${proc}/Source/make_opts git checkout HEAD ${proc}/Source/make_opts fi - echo echo "git diff (start)" git diff --exit-code echo "git diff (end)" + else + echo + echo "(SKIP comparison of newly generated code for ${proc} to that in the madgraph4gpu github repository)" fi } From fb22309b502c80a0a43755e81e018da572cbea0e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 14:13:15 +0100 Subject: [PATCH 06/28] [floating_type_interface] in 
.github/workflows/testsuite_allprocesses.yml, disable on:push triggers to avoid launching two jobs instead of one --- .github/workflows/testsuite_allprocesses.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/testsuite_allprocesses.yml b/.github/workflows/testsuite_allprocesses.yml index 7eaad09c9f..1d67a9efaf 100644 --- a/.github/workflows/testsuite_allprocesses.yml +++ b/.github/workflows/testsuite_allprocesses.yml @@ -19,8 +19,9 @@ on: branches: [ master ] # Trigger the all-processes workflow when new changes to the workflow are pushed - push: - paths: [ .github/workflows/testsuite* ] + # (NB: this is now disabled to avoid triggering two jobs when pushing to a branch for which a PR is opened) + ###push: + ### paths: [ .github/workflows/testsuite* ] #---------------------------------------------------------------------------------------------------------------------------------- From 437f9569686e43862be37c1020b5cb67ca83cb3e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 9 Nov 2023 14:17:14 +0100 Subject: [PATCH 07/28] [floating_type_interface] in .github/workflows/testsuite_oneprocess.sh, fix a bash bug and disable comparisons to the existing repo --- .github/workflows/testsuite_oneprocess.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testsuite_oneprocess.sh b/.github/workflows/testsuite_oneprocess.sh index ea3f294fc8..d49d70c200 100755 --- a/.github/workflows/testsuite_oneprocess.sh +++ b/.github/workflows/testsuite_oneprocess.sh @@ -35,9 +35,9 @@ function codegen() { ./CODEGEN/generateAndCompare.sh -q ${proc%.sa} fi # Check if there are any differences to the current repo - ###compare=true # enable comparison to current git repo - compare=false # disable comparison to current git repo - if [ ${compare} ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then + ###compare=1 # enable comparison to current git repo + compare=0 # disable comparison to current 
git repo + if [ "${compare}" != "0" ] && [ "$(git ls-tree --name-only HEAD ${proc})" != "" ]; then echo echo "Compare newly generated code for ${proc} to that in the madgraph4gpu github repository" git checkout HEAD ${proc}/CODEGEN*.txt From 7f50f2f685802e388140d988d1d6677495e0c89d Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 10 Nov 2023 07:26:51 +0100 Subject: [PATCH 08/28] [floating_type_interface] revert Olivier's changes to MG5AMC before merging master git checkout c803581e57ed157a108c71e3887930bd3338c73d ../../MG5aMC git submodule update --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 8a18cc2423..49c93e01b8 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 8a18cc2423616ee91c4f9d74eec0cb2901e0fd2a +Subproject commit 49c93e01b8596cbdb4e65f628601de1e6f08c744 From 5b4652b3813e41d4471364bc176a9a2a78e0010e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 13:45:23 +0100 Subject: [PATCH 09/28] [floating_type_interface] regenerate gg_tt.mad, but this has a different vector.inc (where comments have disappeared) that I want to debug - will revert --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 27 ++++--- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 26 ++++++- .../gg_tt.mad/Cards/run_card_default.dat | 26 ++++++- epochX/cudacpp/gg_tt.mad/Source/vector.inc | 32 +-------- .../gg_tt.mad/bin/internal/launch_plugin.py | 71 +++++++++++++++---- 5 files changed, 122 insertions(+), 60 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a1fa47508f..900f4f489d 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 
0.005694150924682617  +DEBUG: model prefixing takes 0.005335807800292969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.099 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 2 routines in 0.144 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.129 s VVV1 FFV1 FFV1 @@ -219,7 +219,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -231,16 +230,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.772s -user 0m1.470s -sys 0m0.223s +real 0m1.670s +user 0m1.458s +sys 0m0.213s ************************************************************ * * * W E L C O M E to * @@ -268,8 +267,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP WARNING! CPPRunCard instance has no attribute path quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 80d07f317e..46d89bbc4f 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! 
control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -140,6 +156,14 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule +#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + auto = avx_type ! for SIMD, technology to use for the vectorization + CPP = cudacpp_backend ! 
Fortran/CPP/CUDA switch mode to use + + #********************************************************************* # Options for the cudacpp plugin #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 48d84c73f0..7697a3f769 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! 
fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -139,3 +155,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + auto = avx_type ! for SIMD, technology to use for the vectorization + CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use + diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 863eebbc70..9eab485b2b 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -1,31 +1,3 @@ -C -C If VECSIZE_MEMMAX is greater than 1, a vector API is used: -C this is designed for offloading MEs to GPUs or vectorized C++, -C but it can also be used for computing MEs in Fortran. -C If VECSIZE_MEMMAX equals 1, the old scalar API is used: -C this can only be used for computing MEs in Fortran. -C -C Fortran arrays in the vector API can hold up to VECSIZE_MEMMAX -C events and are statically allocated at compile time. -C The constant value of VECSIZE_MEMMAX is fixed at codegen time -C (output madevent ... --vector_size=). -C -C While the arrays can hold up to VECSIZE_MEMMAX events, -C only VECSIZE_USED (<= VECSIZE_MEMAMX) are used in Fortran loops. 
-C The value of VECSIZE_USED can be chosen at runtime -C (typically 8k-16k for GPUs, 16-32 for vectorized C++). -C -C The value of VECSIZE_USED represents the number of events -C handled by one call to the Fortran/cudacpp "bridge". -C This is not necessarily the number of events which are -C processed in lockstep within a single SIMD vector on CPUs -C or within a single "warp" of threads on GPUs. These parameters -C are internal to the cudacpp bridge and need not be exposed -C to the Fortran program which calls the cudacpp bridge. -C -C NB: THIS FILE CANNOT CONTAIN #ifdef DIRECTIVES -C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR -C (see https://github.com/madgraph5/madgraph4gpu/issues/458). -C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) + + PARAMETER (VECSIZE_MEMMAX=16) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0b849330ef..acc5826fdb 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 
'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_type }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,7 +52,22 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization + %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): logger.warning('WARNING! 
CPPRunCard instance has no attribute path') @@ -58,12 +79,39 @@ def reset_simd(self, old_value, new_value, name): Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_type': + if new_value == 'Auto': + new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_type', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" @@ -82,13 +130,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - 
super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From 7ed73c3272ccd5126d6a5468ea3956e4416fa919 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 13:46:06 +0100 Subject: [PATCH 10/28] [floating_type_interface] revert to the older gg_tt.mad, while I debug vector.inc Revert "[floating_type_interface] regenerate gg_tt.mad, but this has a different vector.inc (where comments have disappeared) that I want to debug - will revert" This reverts commit 5b4652b3813e41d4471364bc176a9a2a78e0010e --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 27 +++---- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 26 +------ .../gg_tt.mad/Cards/run_card_default.dat | 26 +------ epochX/cudacpp/gg_tt.mad/Source/vector.inc | 32 ++++++++- .../gg_tt.mad/bin/internal/launch_plugin.py | 71 ++++--------------- 5 files changed, 60 insertions(+), 122 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 900f4f489d..a1fa47508f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005335807800292969  +DEBUG: model prefixing takes 0.005694150924682617  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.099 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  +ALOHA: aloha creates 2 routines in 0.145 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.129 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -219,6 +219,7 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages +DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -230,16 +231,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 237]  +DEBUG: p.returncode =  0 [output.py at line 232]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m1.670s -user 0m1.458s -sys 0m0.213s +real 0m4.772s +user 0m1.470s +sys 0m0.223s ************************************************************ * * * W E L C O M E to * @@ -267,6 +268,8 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run +run_card missed argument cudacpp_backend. Takes default: CPP +run_card missed argument cudacpp_backend. Takes default: CPP WARNING! CPPRunCard instance has no attribute path quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 46d89bbc4f..80d07f317e 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -80,23 +80,7 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -#********************************************************************* -# Phase-Space Optim (advanced) -#********************************************************************* - 0 = job_strategy ! see appendix of 1507.00020 (page 26) - 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference - -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact - -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. - 2 = survey_nchannel_per_job ! 
control how many Channel are integrated inside a single job on cluster/multicore - -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) -#********************************************************************* -# Compilation flag. -#********************************************************************* - -O = global_flag ! fortran optimization flag use for the all code. - --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' - -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) - +# To see advanced option for Phase-Space optimization: type "update psoptim" #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -156,14 +140,6 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - auto = avx_type ! for SIMD, technology to use for the vectorization - CPP = cudacpp_backend ! 
Fortran/CPP/CUDA switch mode to use - - #********************************************************************* # Options for the cudacpp plugin #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 7697a3f769..48d84c73f0 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -80,23 +80,7 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -#********************************************************************* -# Phase-Space Optim (advanced) -#********************************************************************* - 0 = job_strategy ! see appendix of 1507.00020 (page 26) - 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference - -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact - -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. - 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore - -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) -#********************************************************************* -# Compilation flag. -#********************************************************************* - -O = global_flag ! fortran optimization flag use for the all code. - --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' - -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16 = vector_size ! 
size designed for SIMD/OpenMP/GPU (number of events in lockstep) - +# To see advanced option for Phase-Space optimization: type "update psoptim" #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -155,11 +139,3 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule - -#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - auto = avx_type ! for SIMD, technology to use for the vectorization - CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use - diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 9eab485b2b..863eebbc70 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -1,3 +1,31 @@ +C +C If VECSIZE_MEMMAX is greater than 1, a vector API is used: +C this is designed for offloading MEs to GPUs or vectorized C++, +C but it can also be used for computing MEs in Fortran. +C If VECSIZE_MEMMAX equals 1, the old scalar API is used: +C this can only be used for computing MEs in Fortran. +C +C Fortran arrays in the vector API can hold up to VECSIZE_MEMMAX +C events and are statically allocated at compile time. +C The constant value of VECSIZE_MEMMAX is fixed at codegen time +C (output madevent ... --vector_size=). +C +C While the arrays can hold up to VECSIZE_MEMMAX events, +C only VECSIZE_USED (<= VECSIZE_MEMAMX) are used in Fortran loops. 
+C The value of VECSIZE_USED can be chosen at runtime +C (typically 8k-16k for GPUs, 16-32 for vectorized C++). +C +C The value of VECSIZE_USED represents the number of events +C handled by one call to the Fortran/cudacpp "bridge". +C This is not necessarily the number of events which are +C processed in lockstep within a single SIMD vector on CPUs +C or within a single "warp" of threads on GPUs. These parameters +C are internal to the cudacpp bridge and need not be exposed +C to the Fortran program which calls the cudacpp bridge. +C +C NB: THIS FILE CANNOT CONTAIN #ifdef DIRECTIVES +C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR +C (see https://github.com/madgraph5/madgraph4gpu/issues/458). +C INTEGER VECSIZE_MEMMAX - - PARAMETER (VECSIZE_MEMMAX=16) + PARAMETER (VECSIZE_MEMMAX=16384) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index acc5826fdb..0b849330ef 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -16,28 +16,22 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod - import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod - import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': - path = pjoin(opts['cwd'], 'make_opts') - avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' 
else '' - common_run_interface.CommonRunCmd.update_make_opts_full(path, - {'FPTYPE': self.run_card['floating_type'], - 'AVX': avx_type }) - misc.sprint('FPTYPE checked') - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): + self.options['nb_core'] = multiprocessing.cpu_count() + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): + import pathlib + import os + pjoin = os.path.join cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -52,22 +46,7 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) -# Phase-Space Optimization ------------------------------------------------------------------------------------ -template_on = \ -"""#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization - %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use -""" - -template_off = '' -plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) - class CPPRunCard(banner_mod.RunCardLO): - blocks = banner_mod.RunCardLO.blocks + [plugin_block] - def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): logger.warning('WARNING! 
CPPRunCard instance has no attribute path') @@ -79,39 +58,12 @@ def reset_simd(self, old_value, new_value, name): Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - def reset_makeopts(self, old_value, new_value, name): - if not hasattr(self, 'path'): - raise Exception - avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' - if name == 'floating_type': - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) - elif name == 'avx_type': - if new_value == 'Auto': - new_value = '' - common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) - else: - raise Exception - Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') - subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('floating_type', 'd', include=False, hidden=False, - fct_mod=(self.reset_makeopts,(),{}), - allowed=['m','d','f']) - self.add_param('avx_type', 'auto', include=False, hidden=False, - fct_mod=(self.reset_makeopts,(),{}), - allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) - self['vector_size'] = 16 # already setup in default class (just change value) - self['aloha_flag'] = '--fast-math' - self['matrix_flag'] = '-O3' - self.display_block.append('simd') - self.display_block.append('psoptim') + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" @@ -130,10 +82,13 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - 
super().default_setup() - # change default value: - self['cudacpp_backend'] = 'CUDA' - self['vector_size'] = 16384 # already setup in default class (just change value) + super(CPPRunCard, self).default_setup() + self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) + +#class CUDACPPRunCard(CPPRunCard): +# def default_setup(self): +# super(CPPRunCard, self).default_setup() +# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From 206d9efb66b15afda703e3b6f9d052a48e165a55 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 15:54:21 +0100 Subject: [PATCH 11/28] [floating_type_interface] in CODEGEN launch_plugin.py, ensure that the comments in vector.inc are kept when 'madevent treatcards' is called Note: vector.inc is initially created via export_v4.py, which is not available to treatcards. Then vector.inc is modified by banner.py, whose write_one_include_file is overloaded in launch_plugin.py. 
--- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index acc5826fdb..3527be0e2c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -113,11 +113,25 @@ def default_setup(self): self.display_block.append('simd') self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" From 64f2b176dbcff9d5859a6559b58d72ae5493f9dc Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 16:25:13 +0100 Subject: [PATCH 12/28] [floating_type_interface] regenerate gg_tt.mad, the only change in vector.inc is now that VECSIZE_MEMMAX is 16 instead of 16384 --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 27 +++--- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 26 
+++++- .../gg_tt.mad/Cards/run_card_default.dat | 26 +++++- epochX/cudacpp/gg_tt.mad/Source/vector.inc | 2 +- .../gg_tt.mad/bin/internal/launch_plugin.py | 91 +++++++++++++++---- 5 files changed, 138 insertions(+), 34 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a1fa47508f..bae9fd6d79 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005694150924682617  +DEBUG: model prefixing takes 0.005484580993652344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,10 +162,10 @@ Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for 
process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 2 routines in 0.142 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.130 s VVV1 FFV1 FFV1 @@ -219,7 +219,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -231,16 +230,16 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/S patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. quit -real 0m4.772s -user 0m1.470s -sys 0m0.223s +real 0m1.677s +user 0m1.472s +sys 0m0.204s ************************************************************ * * * W E L C O M E to * @@ -268,8 +267,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP WARNING! CPPRunCard instance has no attribute path quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 80d07f317e..46d89bbc4f 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 
2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -140,6 +156,14 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! 
see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule +#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + auto = avx_type ! for SIMD, technology to use for the vectorization + CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use + + #********************************************************************* # Options for the cudacpp plugin #********************************************************************* diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 48d84c73f0..7697a3f769 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -139,3 +155,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + auto = avx_type ! for SIMD, technology to use for the vectorization + CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use + diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 863eebbc70..3a54c8f26a 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -28,4 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). 
C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16384) + PARAMETER (VECSIZE_MEMMAX=16) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 0b849330ef..3527be0e2c 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_type }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,7 +52,22 @@ def compile(self, *args, **opts): else: 
return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#********************************************************************* +# SIMD/GPU Parametrization +#********************************************************************* + %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] + %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization + %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): logger.warning('WARNING! CPPRunCard instance has no attribute path') @@ -58,18 +79,59 @@ def reset_simd(self, old_value, new_value, name): Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_type': + if new_value == 'Auto': + new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def 
plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_type', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +144,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - 
self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard From a96455b41018b1b70fec2cfafe906a7494645da5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 16:34:52 +0100 Subject: [PATCH 13/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, modify the runcard to make sure that VECSIZE_MEMMAX is 16384 instead of 16 --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 7519fbcc7d..ff02dcbb10 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -215,9 +215,12 @@ function codeGenAndDiff() if [ "${OUTBCK}" == "mad" ]; then # Force the use of strategy SDE=1 in multichannel mode (see #419) sed -i 's/2 = sde_strategy/1 = sde_strategy/' ${outproc}/Cards/run_card.dat + # Force the use of VECSIZE_MEMMAX=16384 + sed -i 's/16 = vector_size/16384 = vector_size/' ${outproc}/Cards/run_card.dat # Generate run_card.inc and param_card.inc (include stdout and stderr in the code generation log which is later checked for errors) # These two steps are part of "cd Source; make" but they actually are code-generating steps - ${outproc}/bin/madevent treatcards run >> ${outproc}_log.txt 2>&1 # AV BUG! 
THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") + # Note: treatcards run also regenerates vector.inc if vector_size has changed in the runcard + ${outproc}/bin/madevent treatcards run >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") ${outproc}/bin/madevent treatcards param >> ${outproc}_log.txt 2>&1 # AV BUG! THIS MAY SILENTLY FAIL (check if output contains "Please report this bug") # Cleanup \rm -f ${outproc}/crossx.html From ad54786f8e772052f5ca1e95a1deaf27f1ca5cbb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 16:36:33 +0100 Subject: [PATCH 14/28] [floating_type_interface] regenerate gg_tt.mad, now vector.inc (and runcard.dat) use 16384 instead of 16 Note also that 'WARNING! CPPRunCard instance has no attribute path' has disappeared from the logs. This is because vector.inc is no longer regenerated? --- .../cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 15 +++++++-------- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 2 +- epochX/cudacpp/gg_tt.mad/Source/vector.inc | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index bae9fd6d79..31d174d109 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005484580993652344  +DEBUG: model prefixing takes 0.0054912567138671875  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,11 +191,11 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.100 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.142 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.677s -user 0m1.472s -sys 0m0.204s +real 0m1.674s +user 0m1.441s +sys 0m0.228s ************************************************************ * * * W E L C O M E to * @@ -267,7 +267,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 46d89bbc4f..9d601c6e36 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -95,7 +95,7 @@ -O = global_flag ! fortran optimization flag use for the all code. --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + 16384 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) #********************************************************************* # Customization (custom cuts/scale/bias/...) * diff --git a/epochX/cudacpp/gg_tt.mad/Source/vector.inc b/epochX/cudacpp/gg_tt.mad/Source/vector.inc index 3a54c8f26a..863eebbc70 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/vector.inc +++ b/epochX/cudacpp/gg_tt.mad/Source/vector.inc @@ -28,4 +28,4 @@ C BECAUSE IT DOES NOT GO THROUGH THE CPP PREPROCESSOR C (see https://github.com/madgraph5/madgraph4gpu/issues/458). C INTEGER VECSIZE_MEMMAX - PARAMETER (VECSIZE_MEMMAX=16) + PARAMETER (VECSIZE_MEMMAX=16384) From b291e3bea0d09176ddfedbdaa0afe4c9b141e7c2 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:33:36 +0100 Subject: [PATCH 15/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, use --vector_size=32 to ensure that vector.inc is recreated when the runcard is modified to use 16384 Indeed, the warning now appears again "WARNING! 
CPPRunCard instance has no attribute path" when regenerating ggtt.mad --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index ff02dcbb10..b790ca5081 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -152,7 +152,7 @@ function codeGenAndDiff() echo -e "\n+++ Generate code for '$proc'\n" ###exit 0 # FOR DEBUGGING # Vector size for mad/madonly meexporter (VECSIZE_MEMMAX) - vecsize=16384 # NB THIS IS NO LONGER IGNORED (but will eventually be tunable via runcards) + vecsize=32 # NB THIS IS NO LONGER IGNORED (but will eventually be tunable via runcards) # Generate code for the specific process pushd $MG5AMC_HOME >& /dev/null mkdir -p ../TMPOUT From bc93a0fd6357b75a9e8a17192bd76490f060da5f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:36:11 +0100 Subject: [PATCH 16/28] [floating_type_interface] regenerate gg_tt.mad, the warning now appears again "WARNING! CPPRunCard instance has no attribute path" --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 27 ++++++++++--------- .../cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_tt.mad/mg5.in | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 31d174d109..75a4522df3 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054912567138671875  +DEBUG: model prefixing takes 0.0056798458099365234  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,7 +157,7 @@ INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams 1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,23 +184,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.106 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.153 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.130 s +ALOHA: aloha creates 4 routines in 0.139 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.674s -user 0m1.441s -sys 0m0.228s +real 0m1.874s +user 0m1.545s +sys 0m0.221s ************************************************************ * * * W E L C O M E to * @@ -267,6 +267,7 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run +WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat index 4c14989a3f..cf111e2e6d 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --ve\ -ctor_size=16384 --me_exporter=standalone_cudacpp +ctor_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_tt.mad/mg5.in b/epochX/cudacpp/gg_tt.mad/mg5.in index 7859bf9b80..b4b356fc51 100644 --- a/epochX/cudacpp/gg_tt.mad/mg5.in +++ b/epochX/cudacpp/gg_tt.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ -output madevent gg_tt.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_tt.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp From 9eb8b4820eaef37e180d485ea1d37a619b1aa631 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:38:48 +0100 Subject: [PATCH 17/28] [floating_type_interface] upgrade MG5aMC/mg5amcnlo from d8c1613cc to 7b8964e28 including Olivier's patch for warning #790 --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index d8c1613ccf..7b8964e28c 160000 --- a/MG5aMC/mg5amcnlo +++ 
b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit d8c1613ccf638b5b078a64379e385def5649622c +Subproject commit 7b8964e28c7ea658d11a5224485e7fd918d7a63a From ca4c1628e22806b39cb7ac948898e0e0449c8658 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:40:23 +0100 Subject: [PATCH 18/28] [floating_type_interface] regenerate gg_tt.mad, the warning now has disappered thanks to Olivier's fix for #790 --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 18 +++++++++--------- .../cudacpp/gg_tt.mad/bin/internal/banner.py | 4 +++- .../bin/internal/madevent_interface.py | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 75a4522df3..8377ba9a6a 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -2,6 +2,7 @@ This version is intended for development/beta testing and NOT for production. This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode +('WARNING: loading of madgraph too slow!!!', 0.515019416809082) ************************************************************ * * * W E L C O M E to * @@ -62,7 +63,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0056798458099365234  +DEBUG: model prefixing takes 0.005373716354370117  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +176,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +192,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.106 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.153 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.139 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 @@ -237,9 +238,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.874s -user 0m1.545s -sys 0m0.221s +real 0m2.117s +user 0m1.839s +sys 0m0.239s ************************************************************ * * * W E L C O M E to * @@ -267,7 +268,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 3995ce8109..651175405f 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card From 8422d19748232930d634e95547c0bb6b28226ece Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:42:59 +0100 Subject: [PATCH 19/28] [floating_type_interface] in CODEGEN launch_plugin.py, remove the workaround for #790, now fixed in mg5amcnlo --- 
.../cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 3527be0e2c..797560fcce 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -70,9 +70,7 @@ class CPPRunCard(banner_mod.RunCardLO): def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return From 1045a420891c3035c3a78e7b6bee896f9caf3e17 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:44:05 +0100 Subject: [PATCH 20/28] [floating_type_interface] regenerate gg_tt.mad, there is no exception thrown after fixing #790 and removing the workaround --- .../cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 15 +++++++-------- .../gg_tt.mad/bin/internal/launch_plugin.py | 4 +--- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 8377ba9a6a..4cb42aacca 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -2,7 +2,6 @@ This version is intended for development/beta testing and NOT for production. 
This version has not been fully tested (if at all) and might have limited user support (if at all) Running MG5 in debug mode -('WARNING: loading of madgraph too slow!!!', 0.515019416809082) ************************************************************ * * * W E L C O M E to * @@ -63,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005373716354370117  +DEBUG: model prefixing takes 0.005273580551147461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -196,12 +195,12 @@ Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.131 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -238,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.117s -user 0m1.839s -sys 0m0.239s +real 0m1.719s +user 0m1.469s +sys 0m0.213s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 3527be0e2c..797560fcce 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -70,9 +70,7 @@ class CPPRunCard(banner_mod.RunCardLO): def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return From 72f12a152f77df51fb07409639e21072cb6ac4de Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:54:50 +0100 Subject: [PATCH 21/28] [floating_type_interface] in CODEGEN launch_plugin.py, improve cudacpp runcard comments and indentation --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 797560fcce..cfec10d9d8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -54,12 +54,12 @@ def compile(self, *args, **opts): # Phase-Space Optimization ------------------------------------------------------------------------------------ template_on = \ -"""#********************************************************************* -# SIMD/GPU Parametrization 
-#********************************************************************* - %(floating_type)s = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization - %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_type)s = avx_type ! SIMD vectorization level: none, sse4, avx2, 512y, 512z + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA """ template_off = '' From 843442c280c161c5e4d65c87722714d5c7eb677c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 17:57:23 +0100 Subject: [PATCH 22/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, do not add cudacpp_backend at the end of runcards, as it is now already there by default --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index b790ca5081..99bf044767 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -251,10 +251,7 @@ function codeGenAndDiff() #********************************************************************* # Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) - -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! 
valid backends are FORTRAN, CPP, CUDA" >> ${outproc}/Cards/run_card.dat +-O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)" >> ${outproc}/Cards/run_card.dat fi fi popd >& /dev/null From 20364f5b16dff94fa24ea2e6dfc7b145cdd992a9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:11:28 +0100 Subject: [PATCH 23/28] [floating_type_interface] upgrade MG5aMC/mg5amcnlo to include my minor changes to runcard comments --- MG5aMC/mg5amcnlo | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MG5aMC/mg5amcnlo b/MG5aMC/mg5amcnlo index 7b8964e28c..23f61b93fd 160000 --- a/MG5aMC/mg5amcnlo +++ b/MG5aMC/mg5amcnlo @@ -1 +1 @@ -Subproject commit 7b8964e28c7ea658d11a5224485e7fd918d7a63a +Subproject commit 23f61b93fdf268a1cdcbd363cd449c88b3511d7a From e4eac91e8e10d2df390c159b82ec0d7a438cdfdf Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:12:12 +0100 Subject: [PATCH 24/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, modify the existing global_flag in runcards, instead of adding a new line at the end --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 99bf044767..91341dbc4f 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -217,6 +217,8 @@ function codeGenAndDiff() sed -i 's/2 = sde_strategy/1 = sde_strategy/' ${outproc}/Cards/run_card.dat # Force the use of VECSIZE_MEMMAX=16384 sed -i 's/16 = vector_size/16384 = vector_size/' ${outproc}/Cards/run_card.dat + # Force the use of fast-math in Fortran builds + sed -i 's/-O = global_flag.*/-O3 -ffast-math -fbounds-check = global_flag ! 
build flags for all Fortran code (for a fair comparison to cudacpp; default is -O)/' ${outproc}/Cards/run_card.dat # Generate run_card.inc and param_card.inc (include stdout and stderr in the code generation log which is later checked for errors) # These two steps are part of "cd Source; make" but they actually are code-generating steps # Note: treatcards run also regenerates vector.inc if vector_size has changed in the runcard @@ -245,13 +247,6 @@ function codeGenAndDiff() cat ${outproc}/Source/make_opts | sed '/#end/q' | sort > ${outproc}/Source/make_opts.new cat ${outproc}/Source/make_opts | sed -n -e '/#end/,$p' >> ${outproc}/Source/make_opts.new \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts - echo " -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp)" >> ${outproc}/Cards/run_card.dat fi fi popd >& /dev/null From 01a2bfa3011b8b6932d3bb65e8bc8aecd10bf5fd Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:28:25 +0100 Subject: [PATCH 25/28] [floating_type_interface] in CODEGEN generateAndCompare.sh, minor fix to avoid duplicate '#end_of_make_opts_variables' in make_opts Also add a comment about GLOBAL_FLAG in make_opts (why is this not propagated from the runcards?) 
--- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index 91341dbc4f..f3f830205c 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -238,13 +238,14 @@ function codeGenAndDiff() touch ${outproc}/HTML/.keep # new file if [ "${patchlevel}" != "0" ]; then # Add global flag '-O3 -ffast-math -fbounds-check' as in previous gridpacks + # (FIXME? these flags are already set in the runcards, why are they not propagated to make_opts?) echo "GLOBAL_FLAG=-O3 -ffast-math -fbounds-check" > ${outproc}/Source/make_opts.new cat ${outproc}/Source/make_opts >> ${outproc}/Source/make_opts.new \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts fi if [ "${patchlevel}" == "2" ]; then sed -i 's/DEFAULT_F2PY_COMPILER=f2py.*/DEFAULT_F2PY_COMPILER=f2py3/' ${outproc}/Source/make_opts - cat ${outproc}/Source/make_opts | sed '/#end/q' | sort > ${outproc}/Source/make_opts.new + cat ${outproc}/Source/make_opts | sed '/#end/q' | head --lines=-1 | sort > ${outproc}/Source/make_opts.new cat ${outproc}/Source/make_opts | sed -n -e '/#end/,$p' >> ${outproc}/Source/make_opts.new \mv ${outproc}/Source/make_opts.new ${outproc}/Source/make_opts fi From 5fc54dc840835485102467b2362e46f0aebef420 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:47:25 +0100 Subject: [PATCH 26/28] [floating_type_interface] in CODEGEN launch_plugin.py, rename avx_type as avx_level in the runcard (and fix a minor bug, replace Auto by auto) --- .../PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index cfec10d9d8..3b09713e12 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -32,10 +32,10 @@ def compile(self, *args, **opts): self.options['nb_core'] = multiprocessing.cpu_count() if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') - avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' common_run_interface.CommonRunCmd.update_make_opts_full(path, {'FPTYPE': self.run_card['floating_type'], - 'AVX': avx_type }) + 'AVX': avx_level }) misc.sprint('FPTYPE checked') if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py @@ -58,7 +58,7 @@ def compile(self, *args, **opts): # SIMD/GPU configuration for the CUDACPP plugin #************************************************************************ %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) - %(avx_type)s = avx_type ! SIMD vectorization level: none, sse4, avx2, 512y, 512z + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto %(cudacpp_backend)s = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA """ @@ -80,12 +80,11 @@ def reset_simd(self, old_value, new_value, name): def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception - avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' if name == 'floating_type': common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) - elif name == 'avx_type': - if new_value == 'Auto': - new_value = '' + elif name == 'avx_level': + if new_value == 'auto': new_value = '' common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) else: raise Exception @@ -100,7 +99,7 @@ def default_setup(self): self.add_param('floating_type', 'd', include=False, hidden=False, fct_mod=(self.reset_makeopts,(),{}), allowed=['m','d','f']) - self.add_param('avx_type', 'auto', include=False, hidden=False, + self.add_param('avx_level', 'auto', include=False, hidden=False, fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, From 795ae3752e88f8d1307d2e1f7d7ed25ee445225e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:48:51 +0100 Subject: [PATCH 27/28] [floating_type_interface] regenerate gg_tt.mad, all looks ok (only make_opts changes but only a comment is removed in there) --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 14 +++++----- epochX/cudacpp/gg_tt.mad/Cards/run_card.dat | 26 ++++++------------- .../gg_tt.mad/Cards/run_card_default.dat | 14 +++++----- epochX/cudacpp/gg_tt.mad/Source/make_opts | 1 - .../cudacpp/gg_tt.mad/bin/internal/banner.py | 2 +- .../gg_tt.mad/bin/internal/launch_plugin.py | 25 +++++++++--------- 6 files changed, 35 insertions(+), 47 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt 
b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 4cb42aacca..9082dbc054 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005273580551147461  +DEBUG: model prefixing takes 0.00536799430847168  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,12 +195,12 @@ Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.132 s +ALOHA: aloha creates 4 routines in 0.130 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.719s -user 0m1.469s -sys 0m0.213s +real 0m1.673s +user 0m1.458s +sys 0m0.207s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat index 9d601c6e36..a6463dc262 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card.dat @@ -92,10 +92,10 @@ #********************************************************************* # Compilation flag. #********************************************************************* - -O = global_flag ! fortran optimization flag use for the all code. + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16384 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) #********************************************************************* # Customization (custom cuts/scale/bias/...) * @@ -156,20 +156,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - auto = avx_type ! for SIMD, technology to use for the vectorization - CPP = cudacpp_backend ! 
Fortran/CPP/CUDA switch mode to use - - -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat index 7697a3f769..27e990a016 100644 --- a/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt.mad/Cards/run_card_default.dat @@ -95,7 +95,7 @@ -O = global_flag ! fortran optimization flag use for the all code. --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - 16 = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) #********************************************************************* # Customization (custom cuts/scale/bias/...) * @@ -156,10 +156,10 @@ systematics = systematics_program ! 
none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - d = floating_type ! single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - auto = avx_type ! for SIMD, technology to use for the vectorization - CPP = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt.mad/Source/make_opts b/epochX/cudacpp/gg_tt.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_tt.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py index 651175405f..bd1517985f 100755 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/banner.py @@ -3915,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! 
fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 797560fcce..3b09713e12 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -32,10 +32,10 @@ def compile(self, *args, **opts): self.options['nb_core'] = multiprocessing.cpu_count() if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': path = pjoin(opts['cwd'], 'make_opts') - avx_type = self.run_card['avx_type'] if self.run_card['avx_type'] != 'auto' else '' + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' common_run_interface.CommonRunCmd.update_make_opts_full(path, {'FPTYPE': self.run_card['floating_type'], - 'AVX': avx_type }) + 'AVX': avx_level }) misc.sprint('FPTYPE checked') if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py @@ -54,12 +54,12 @@ def compile(self, *args, **opts): # Phase-Space Optimization ------------------------------------------------------------------------------------ template_on = \ -"""#********************************************************************* -# SIMD/GPU Parametrization -#********************************************************************* - %(floating_type)s = floating_type ! 
single precision(f), double precision (d), mixed (m) [double for amplitude, single for color] - %(avx_type)s = avx_type ! for SIMD, technology to use for the vectorization - %(cudacpp_backend)s = cudacpp_backend ! Fortran/CPP/CUDA switch mode to use +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA """ template_off = '' @@ -80,12 +80,11 @@ def reset_simd(self, old_value, new_value, name): def reset_makeopts(self, old_value, new_value, name): if not hasattr(self, 'path'): raise Exception - avx_value = self['avx_type'] if self['avx_type'] != 'auto' else '' + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' if name == 'floating_type': common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) - elif name == 'avx_type': - if new_value == 'Auto': - new_value = '' + elif name == 'avx_level': + if new_value == 'auto': new_value = '' common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) else: raise Exception @@ -100,7 +99,7 @@ def default_setup(self): self.add_param('floating_type', 'd', include=False, hidden=False, fct_mod=(self.reset_makeopts,(),{}), allowed=['m','d','f']) - self.add_param('avx_type', 'auto', include=False, hidden=False, + self.add_param('avx_level', 'auto', include=False, hidden=False, fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, From 4c912508e57892729d2a57ddb95a2b78b37447ed 
Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 15 Nov 2023 18:54:52 +0100 Subject: [PATCH 28/28] [floating_type_interface] ** COMPLETE floating_type_interface ** regenerate all 15 processes No need to rerun the tests, there are no real changes in the code with respect to upstream/master --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 39 ++-- .../ee_mumu.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat | 32 ++- .../ee_mumu.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/ee_mumu.mad/Source/make_opts | 1 - .../ee_mumu.mad/bin/internal/banner.py | 6 +- .../ee_mumu.mad/bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/ee_mumu.mad/mg5.in | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 29 ++- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 25 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 48 ++-- .../gg_tt01g.mad/Cards/proc_card_mg5.dat | 2 +- .../cudacpp/gg_tt01g.mad/Cards/run_card.dat | 32 ++- .../gg_tt01g.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gg_tt01g.mad/Source/make_opts | 1 - .../gg_tt01g.mad/bin/internal/banner.py | 6 +- .../bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gg_tt01g.mad/mg5.in | 2 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 40 ++-- .../gg_ttg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat | 32 ++- .../gg_ttg.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gg_ttg.mad/Source/make_opts | 1 - .../cudacpp/gg_ttg.mad/bin/internal/banner.py | 6 +- .../gg_ttg.mad/bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gg_ttg.mad/mg5.in | 2 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 29 ++- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 40 ++-- .../gg_ttgg.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat | 32 
++- .../gg_ttgg.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gg_ttgg.mad/Source/make_opts | 1 - .../gg_ttgg.mad/bin/internal/banner.py | 6 +- .../gg_ttgg.mad/bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gg_ttgg.mad/mg5.in | 2 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 25 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 44 ++-- .../gg_ttggg.mad/Cards/proc_card_mg5.dat | 2 +- .../cudacpp/gg_ttggg.mad/Cards/run_card.dat | 32 ++- .../gg_ttggg.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gg_ttggg.mad/Source/make_opts | 1 - .../gg_ttggg.mad/bin/internal/banner.py | 6 +- .../bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gg_ttggg.mad/mg5.in | 2 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 29 ++- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 46 ++-- .../gq_ttq.mad/Cards/proc_card_mg5.dat | 2 +- epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat | 32 ++- .../gq_ttq.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/gq_ttq.mad/Source/make_opts | 1 - .../cudacpp/gq_ttq.mad/bin/internal/banner.py | 6 +- .../gq_ttq.mad/bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/gq_ttq.mad/mg5.in | 2 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 35 ++- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 23 +- .../CODEGEN_mad_pp_tt012j_log.txt | 216 +++++++++--------- .../pp_tt012j.mad/Cards/proc_card_mg5.dat | 2 +- .../cudacpp/pp_tt012j.mad/Cards/run_card.dat | 32 ++- .../pp_tt012j.mad/Cards/run_card_default.dat | 26 ++- epochX/cudacpp/pp_tt012j.mad/Source/make_opts | 1 - .../pp_tt012j.mad/bin/internal/banner.py | 6 +- .../bin/internal/launch_plugin.py | 94 ++++++-- .../bin/internal/madevent_interface.py | 2 +- epochX/cudacpp/pp_tt012j.mad/mg5.in | 2 +- 71 files changed, 1234 insertions(+), 605 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt 
b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index e6546f684c..e090137829 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005372047424316406  +DEBUG: model prefixing takes 0.005680561065673828  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -156,15 +156,15 @@ INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams 1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams -output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu WARNING: File 
exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -183,27 +183,27 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  WARNING: vector code for lepton pdf not implemented. 
We removed the option to run dressed lepton  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.098 s +Wrote files for 8 helas calls in 0.097 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.200 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 3 routines in 0.199 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.255 s +ALOHA: aloha creates 7 routines in 0.254 s FFV1 FFV1 FFV2 @@ -226,7 +226,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -242,16 +241,16 @@ patching file matrix1.f Hunk #3 succeeded at 230 (offset 9 lines). 
Hunk #4 succeeded at 267 (offset 18 lines). Hunk #5 succeeded at 312 (offset 18 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. quit -real 0m4.853s -user 0m1.653s -sys 0m0.201s +real 0m1.848s +user 0m1.621s +sys 0m0.225s ************************************************************ * * * W E L C O M E to * @@ -279,10 +278,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat index 618adbca06..22e76563ab 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --\ -vector_size=16384 --me_exporter=standalone_cudacpp +vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat index 86f6a33258..1084532333 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card.dat @@ -85,7 +85,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -182,12 +198,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat index 67c1a4de28..0fa0402261 100644 --- a/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/ee_mumu.mad/Cards/run_card_default.dat @@ -85,7 +85,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -181,3 +197,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/ee_mumu.mad/Source/make_opts b/epochX/cudacpp/ee_mumu.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/ee_mumu.mad/Source/make_opts +++ b/epochX/cudacpp/ee_mumu.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include 
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/ee_mumu.mad/mg5.in b/epochX/cudacpp/ee_mumu.mad/mg5.in index 12a2c58512..4e83015b40 100644 --- a/epochX/cudacpp/ee_mumu.mad/mg5.in +++ b/epochX/cudacpp/ee_mumu.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate e+ e- > mu+ mu- -output madevent ee_mumu.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ee_mumu.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 8cb80f0d38..3447ea61e3 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005633831024169922  +DEBUG: model prefixing takes 0.0055027008056640625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,34 +154,34 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. 
INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.270 s FFV1 FFV1 FFV2 @@ -198,9 +198,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.653s -user 0m0.601s -sys 0m0.049s +real 0m0.657s +user 0m0.602s +sys 0m0.050s diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 9082dbc054..3326a8488f 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00536799430847168  +DEBUG: model prefixing takes 0.005394458770751953  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.100 s +Wrote files for 10 helas calls in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.142 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.130 s +ALOHA: aloha creates 4 routines in 0.138 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.673s -user 0m1.458s -sys 0m0.207s +real 0m1.677s +user 0m1.453s +sys 0m0.213s ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 805df19bd9..f09b4fa669 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00567626953125  +DEBUG: model prefixing takes 0.00550079345703125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,26 +161,26 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files 
CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.146 s VVV1 FFV1 FFV1 @@ -193,9 +193,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.529s -user 0m0.478s -sys 0m0.048s +real 0m0.583s +user 0m0.466s +sys 0m0.061s diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 9d4dbd85f0..acacaf4036 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005400419235229492  +DEBUG: model prefixing takes 0.0055658817291259766  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,15 +165,15 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams 1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,15 +194,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -211,29 +211,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s -Wrote files for 46 helas calls in 0.242 s +Wrote files for 46 helas calls in 0.245 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.324 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.321 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines 
with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.308 s +ALOHA: aloha creates 10 routines in 0.304 s VVV1 VVV1 FFV1 @@ -257,7 +257,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -277,16 +276,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m5.282s -user 0m2.049s -sys 0m0.227s +real 0m2.697s +user 0m2.018s +sys 0m0.247s ************************************************************ * * * W E L C O M E to * @@ -314,9 +313,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. 
Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat index d0845f65f5..06ea2195d8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat @@ -47,4 +47,4 @@ define vl = ve vm vt define vl~ = ve~ vm~ vt~ add process g g > t t~ g output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False -\ --vector_size=16384 --me_exporter=standalone_cudacpp +-vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat index 3360456835..f7c6502eec 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -161,12 +177,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat index 46378139c8..0e45d1a8c9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_tt01g.mad/Cards/run_card_default.dat @@ -98,7 +98,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -160,3 +176,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_tt01g.mad/Source/make_opts +++ b/epochX/cudacpp/gg_tt01g.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['FORTRAN', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_tt01g.mad/mg5.in b/epochX/cudacpp/gg_tt01g.mad/mg5.in index a20e166e81..95984fcf10 100644 --- a/epochX/cudacpp/gg_tt01g.mad/mg5.in +++ b/epochX/cudacpp/gg_tt01g.mad/mg5.in @@ -2,4 +2,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ add process g g > t t~ g -output madevent gg_tt01g.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_tt01g.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 68afa8d9b0..b52dc31122 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005378007888793945  +DEBUG: model prefixing takes 0.0053288936614990234  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -157,15 +157,15 @@ INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams 1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.148 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.155 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 
0.326 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.309 s +ALOHA: aloha creates 10 routines in 0.327 s VVV1 VVV1 FFV1 @@ -230,7 +230,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -246,16 +245,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. quit -real 0m5.147s -user 0m1.924s -sys 0m0.225s +real 0m2.189s +user 0m1.943s +sys 0m0.223s ************************************************************ * * * W E L C O M E to * @@ -283,9 +282,6 @@ Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat index a0ffbbc219..9d09090869 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --v\ -ector_size=16384 --me_exporter=standalone_cudacpp +ector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat index a9fbb5a212..f119b5a1c7 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! 
for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -143,12 +159,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! 
floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat index 1464f610ba..e89c78702d 100644 --- a/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! 
fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -142,3 +158,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttg.mad/Source/make_opts b/epochX/cudacpp/gg_ttg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include 
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttg.mad/mg5.in b/epochX/cudacpp/gg_ttg.mad/mg5.in index 98f53ce50d..e37d10d865 100644 --- a/epochX/cudacpp/gg_ttg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g -output madevent gg_ttg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 97056958fe..b9716683be 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005817890167236328  +DEBUG: model prefixing takes 0.005490303039550781  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.326 s +ALOHA: aloha creates 5 routines in 0.322 s VVV1 VVV1 FFV1 @@ -201,9 +201,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.779s -user 0m0.713s -sys 0m0.062s +real 0m0.784s +user 0m0.699s +sys 0m0.063s diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index eacd7a356a..ee3d38dfb1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053293704986572266  +DEBUG: model prefixing takes 0.00561833381652832  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.157 s +1 processes with 123 diagrams generated in 0.154 s Total: 1 processes with 123 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -184,29 +184,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s -Wrote files for 222 helas calls in 0.691 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +Wrote files for 222 helas calls in 0.679 s ALOHA: aloha starts to compute helicity amplitudes 
ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.333 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.323 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.314 s VVV1 VVV1 FFV1 @@ -233,7 +233,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -249,15 +248,15 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. 
Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m6.262s -user 0m3.028s +real 0m3.226s +user 0m2.976s sys 0m0.232s ************************************************************ * * @@ -286,9 +285,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat index b7568d1a73..ea9cfcde68 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --\ -vector_size=16384 --me_exporter=standalone_cudacpp +vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat index 88d4377d71..42728a48f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 
2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -168,12 +184,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! 
see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat index 26fa7f39dd..0374d7ba60 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttgg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! 
limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -167,3 +183,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttgg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttgg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['FORTRAN', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttgg.mad/mg5.in b/epochX/cudacpp/gg_ttgg.mad/mg5.in index e2c5858b63..53784bf161 100644 --- a/epochX/cudacpp/gg_ttgg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttgg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g g -output madevent gg_ttgg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttgg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 80631c94bf..d62aabc436 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00567317008972168  +DEBUG: model prefixing takes 0.005473136901855469  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -161,29 +161,29 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc 
INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.318 s +ALOHA: aloha creates 5 routines in 0.894 s VVV1 VVV1 FFV1 @@ -204,9 +204,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m4.435s -user 0m1.373s +real 0m2.023s +user 0m1.361s sys 0m0.056s diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index ab3974344c..d94e7252af 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005319833755493164  +DEBUG: model prefixing takes 0.005348920822143555  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,17 +155,17 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.855 s +1 processes with 1240 diagrams generated in 1.857 s Total: 1 processes with 1240 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -175,9 +175,9 @@ INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] -INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction +INFO: Color-Flow passed to 1630 term in 7s. 
Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -186,29 +186,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 0, 3, 4, 0, 5, 6, 0, 0, 0, 0, 0, 7, 8, 9, 0, 10, 11, 12, 0, 13, 14, 15, 0, 16, 17, 18, 19, 20, 21, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 0, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 0, 67, 68, 69, 70, 71, 72, 73, 74, 75, 0, 76, 77, 78, 79, 80, 81, 82, 83, 84, 0, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 0, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 0, 121, 122, 0, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 0, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 0, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 0, 197, 198, 199, 200, 201, 202, 0, 203, 204, 205, 206, 207, 208, 0, 209, 210, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 0, 226, 227, 0, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 0, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 0, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 0, 302, 303, 304, 305, 306, 307, 0, 308, 309, 310, 311, 312, 
313, 0, 314, 315, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 316, 317, 318, 319, 320, 321, 0, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 0, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 0, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 0, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 0, 378, 379, 0, 380, 381, 0, 0, 0, 0, 0, 382, 383, 384, 385, 386, 387, 388, 389, 390, 0, 391, 392, 393, 394, 395, 396, 397, 398, 399, 0, 400, 401, 402, 403, 404, 405, 406, 407, 408, 0, 409, 410, 411, 412, 413, 414, 0, 415, 416, 417, 418, 419, 420, 0, 0, 0, 421, 422, 423, 424, 425, 426, 0, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 0, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 0, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 0, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 0, 483, 484, 0, 485, 486, 0, 0, 0, 0, 0, 487, 488, 489, 490, 491, 492, 493, 494, 495, 0, 496, 497, 498, 499, 500, 501, 502, 503, 504, 0, 505, 506, 507, 508, 509, 510, 511, 512, 513, 0, 514, 515, 516, 517, 518, 519, 0, 520, 521, 522, 523, 524, 525, 0, 0, 0, 526, 527, 528, 529, 530, 531, 0, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 0, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 0, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 0, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 0, 588, 589, 0, 590, 591, 0, 0, 0, 0, 0, 592, 593, 594, 595, 596, 597, 598, 599, 600, 0, 601, 602, 603, 604, 605, 606, 607, 608, 609, 0, 610, 611, 612, 613, 614, 615, 616, 617, 618, 0, 619, 620, 621, 622, 623, 624, 0, 625, 626, 627, 628, 629, 630, 0, 0, 0, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 0, 664, 665, 666, 667, 668, 669, 0, 670, 671, 
672, 673, 674, 675, 0, 0, 0, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 0, 709, 710, 711, 712, 713, 714, 0, 715, 716, 717, 718, 719, 720, 0, 0, 0, 721, 722, 0, 723, 724, 0, 725, 726, 0, 0, 0, 0, 0, 727, 728, 729, 730, 731, 732, 733, 734, 735, 0, 736, 737, 738, 739, 740, 741, 742, 743, 744, 0, 745, 746, 747, 748, 749, 750, 751, 752, 753, 0, 754, 755, 756, 757, 758, 759, 0, 760, 761, 762, 763, 764, 765, 766, 767, 0, 768, 769, 0, 770, 771, 0, 0, 0, 0, 0, 772, 773, 774, 775, 776, 777, 778, 779, 780, 0, 781, 782, 783, 784, 785, 786, 787, 788, 789, 0, 790, 791, 792, 793, 794, 795, 796, 797, 798, 0, 799, 800, 801, 802, 803, 804, 0, 805, 806, 807, 808, 809, 810, 811, 812, 0, 813, 814, 0, 815, 816, 0, 0, 0, 0, 0, 817, 818, 819, 820, 821, 822, 823, 824, 825, 0, 826, 827, 828, 829, 830, 831, 832, 833, 834, 0, 835, 836, 837, 838, 839, 840, 841, 842, 843, 0, 844, 845, 846, 847, 848, 849, 0, 850, 851, 852, 853, 854, 855, 856, 857, 0, 858, 859, 0, 860, 861, 0, 0, 0, 0, 862, 863, 0, 864, 865, 0, 866, 867, 0, 0, 0, 0, 868, 869, 0, 870, 871, 0, 872, 873, 0, 0, 0, 0, 0, 0, 0, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 0, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 0, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 0, 928, 929, 930, 931, 932, 933, 0, 934, 935, 936, 937, 938, 939, 0, 940, 941, 942, 943, 944, 945, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at 
line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.574 s -Wrote files for 2281 helas calls in 18.431 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.607 s +Wrote files for 2281 helas calls in 18.169 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.335 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.317 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 
routines in 0.313 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -235,7 +235,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -251,16 +250,16 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m32.103s -user 0m28.586s -sys 0m0.412s +real 0m28.894s +user 0m28.357s +sys 0m0.382s ************************************************************ * * * W E L C O M E to * @@ -288,9 +287,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat index 2f92ecc4ba..3923568dd8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat @@ -46,4 +46,4 @@ define l- = e- mu- define vl = ve vm vt define vl~ = ve~ vm~ vt~ output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False -\ --vector_size=16384 --me_exporter=standalone_cudacpp +-vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat index 0a6bf20eb9..8170176a11 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card.dat @@ -80,7 +80,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -174,12 +190,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat index 3714e71997..8da9ac1563 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gg_ttggg.mad/Cards/run_card_default.dat @@ -80,7 +80,23 @@ 2 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -173,3 +189,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gg_ttggg.mad/Source/make_opts +++ b/epochX/cudacpp/gg_ttggg.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['FORTRAN', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gg_ttggg.mad/mg5.in b/epochX/cudacpp/gg_ttggg.mad/mg5.in index cdbc845cdd..f92d17d219 100644 --- a/epochX/cudacpp/gg_ttggg.mad/mg5.in +++ b/epochX/cudacpp/gg_ttggg.mad/mg5.in @@ -1,4 +1,4 @@ set stdout_level DEBUG set zerowidth_tchannel F generate g g > t t~ g g g -output madevent gg_ttggg.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gg_ttggg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 33bae20142..8660fec52a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005532503128051758  +DEBUG: model prefixing takes 0.005609273910522461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,35 +155,35 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.880 s +1 processes with 1240 diagrams generated in 1.857 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.540 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.536 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.351 s +ALOHA: aloha creates 5 routines in 0.341 s VVV1 VVV1 FFV1 @@ -204,9 +204,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
-DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m15.959s -user 0m12.810s -sys 0m0.102s +real 0m12.866s +user 0m12.717s +sys 0m0.100s diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 89cb2749b0..97f5e25170 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057373046875  +DEBUG: model prefixing takes 0.005373477935791016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,17 +170,17 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams -output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -207,15 +207,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -224,19 +224,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.217 s +Wrote files for 32 helas calls in 0.219 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines ALOHA: aloha creates 2 routines in 0.144 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines @@ -260,7 +260,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', 
self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -288,16 +287,16 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. quit -real 0m4.915s -user 0m1.680s -sys 0m0.237s +real 0m1.929s +user 0m1.701s +sys 0m0.227s ************************************************************ * * * W E L C O M E to * @@ -325,9 +324,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat index efb0752a31..c49c39c3c4 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat @@ -48,4 +48,4 @@ define vl~ = ve~ vm~ vt~ define q = u c d s u~ c~ d~ s~ generate g q > t t~ q output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --v\ -ector_size=16384 --me_exporter=standalone_cudacpp +ector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat index 69e6fa2bf2..d987e643f0 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card.dat @@ -82,7 +82,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -145,12 +161,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat index 4c99f39248..ef9e4b2707 100644 --- a/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/gq_ttq.mad/Cards/run_card_default.dat @@ -82,7 +82,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -144,3 +160,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/gq_ttq.mad/Source/make_opts b/epochX/cudacpp/gq_ttq.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/gq_ttq.mad/Source/make_opts +++ b/epochX/cudacpp/gq_ttq.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortran', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/gq_ttq.mad/mg5.in b/epochX/cudacpp/gq_ttq.mad/mg5.in index e93843b8cd..2273ae9cfd 100644 --- a/epochX/cudacpp/gq_ttq.mad/mg5.in +++ b/epochX/cudacpp/gq_ttq.mad/mg5.in @@ -2,4 +2,4 @@ set stdout_level DEBUG set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ generate g q > t t~ q -output madevent gq_ttq.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent gq_ttq.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 16374bd28e..91509797eb 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005791902542114258  +DEBUG: model prefixing takes 0.005265951156616211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,14 +170,14 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.078 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 @@ -190,28 +190,28 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory 
/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=1 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=1 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.141 s FFV1 FFV1 FFV1 @@ -225,9 +225,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.656s -user 0m0.594s -sys 0m0.059s +real 0m0.640s +user 0m0.574s +sys 0m0.061s diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 3b04fc3fb3..452dbff73e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -135,25 +135,25 @@ output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_heft_gg_h Load PLUGIN.CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering 
PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 189]  -DEBUG: type(subproc_group)= [output.py at line 190]  -DEBUG: type(fortran_model)= [output.py at line 191]  -DEBUG: type(me)= me=0 [output.py at line 192]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. 
Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.060 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -163,9 +163,8 @@ FileWriter for / FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize False [output.py at line 206]  quit -real 0m3.422s -user 0m0.371s -sys 0m0.048s +real 0m0.415s +user 0m0.358s +sys 0m0.050s diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 8b6ca99446..e73dd42300 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00538325309753418  +DEBUG: model prefixing takes 0.0053882598876953125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.029 s +5 processes with 7 diagrams generated in 0.031 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.134 s +13 processes with 76 diagrams generated in 0.141 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,17 +378,17 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.811 s +65 processes with 1119 diagrams generated in 1.922 s Total: 83 processes with 1202 diagrams -output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_cudacpp [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 155]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -506,15 +506,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 0, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 0, 46, 47, 48, 49, 50, 51, 52, 53, 54, 0, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 0, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 0, 88, 89, 90, 91, 92, 93, 0, 94, 95, 96, 97, 98, 99, 0, 100, 101, 102, 103, 104, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  0 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -523,15 +523,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  1 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -540,15 +540,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  2 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -557,15 +557,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  3 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -574,15 +574,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 0, 34, 35] [export_cpp.py at line 711]  DEBUG: subproc_number =  4 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -591,15 +591,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0] [export_cpp.py at line 711]  DEBUG: subproc_number =  5 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -608,15 +608,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  6 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -625,15 +625,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  7 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -642,15 +642,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [export_cpp.py at line 711]  DEBUG: subproc_number =  8 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -659,15 +659,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  9 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -676,15 +676,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  10 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -693,15 +693,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  11 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -710,15 +710,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5, 6, 7] [export_cpp.py at line 711]  DEBUG: subproc_number =  12 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -727,15 +727,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  13 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -744,15 +744,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  14 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -761,15 +761,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3, 4, 5] [export_cpp.py at line 711]  DEBUG: subproc_number =  15 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -778,15 +778,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1, 2, 3] [export_cpp.py at line 711]  DEBUG: subproc_number =  16 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -795,29 +795,29 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: config_map =  [1] [export_cpp.py at line 711]  DEBUG: subproc_number =  17 [export_cpp.py at line 712]  DEBUG: Done [export_cpp.py at line 713]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  -DEBUG: vector, subproc_group,self.opt['vector_size'] =  16384 True 16384 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  False True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  +DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.267 s -Wrote files for 810 helas calls in 3.215 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.279 s +Wrote files for 810 helas calls in 3.193 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.333 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 197]  +ALOHA: aloha creates 5 routines in 0.331 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha 
creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.312 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -844,7 +844,6 @@ save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CO INFO: Use Fortran compiler gfortran INFO: Use c++ compiler g++ INFO: Generate web pages -DEBUG: 'Entering PLUGIN_ProcessExporter.finalize', self.in_madevent_mode, type(self) =  Entering PLUGIN_ProcessExporter.finalize True [output.py at line 206]  DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common patching file Source/genps.inc patching file Source/makefile @@ -1022,16 +1021,16 @@ Hunk #2 succeeded at 194 (offset 51 lines). Hunk #3 succeeded at 272 (offset 51 lines). Hunk #4 succeeded at 300 (offset 51 lines). Hunk #5 succeeded at 345 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 232]  +DEBUG: p.returncode =  0 [output.py at line 237]  Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. quit -real 0m11.764s -user 0m8.242s -sys 0m0.480s +real 0m8.865s +user 0m8.358s +sys 0m0.477s ************************************************************ * * * W E L C O M E to * @@ -1059,9 +1058,6 @@ Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run -run_card missed argument cudacpp_backend. Takes default: CPP -run_card missed argument cudacpp_backend. Takes default: CPP -WARNING! 
CPPRunCard instance has no attribute path quit INFO: launch in debug mode diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat index c0b1a2fd98..22837639b6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat @@ -50,4 +50,4 @@ generate p p > t t~ @0 add process p p > t t~ j @1 add process p p > t t~ j j @2 output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False \ ---vector_size=16384 --me_exporter=standalone_cudacpp +--vector_size=32 --me_exporter=standalone_cudacpp diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat index f4ca7c0b5d..8b1007a3e5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! 
control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O3 -ffast-math -fbounds-check = global_flag ! build flags for all Fortran code (for a fair comparison to cudacpp; default is -O) + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' + 16384 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -186,12 +202,10 @@ systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule -#********************************************************************* -# Options for the cudacpp plugin -#********************************************************************* - -# Set cudacpp-specific values of non-cudacpp-specific options --O3 -ffast-math -fbounds-check = global_flag ! build flags for Fortran code (for a fair comparison to cudacpp) +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA -# New cudacpp-specific options (default values are defined in banner.py) -CPP = cudacpp_backend ! valid backends are FORTRAN, CPP, CUDA diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat index 58e12c11d7..2f72b65255 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat +++ b/epochX/cudacpp/pp_tt012j.mad/Cards/run_card_default.dat @@ -98,7 +98,23 @@ 1 = sde_strategy ! default integration strategy (hep-ph/2021.00773) ! 1 is old strategy (using amp square) ! 2 is new strategy (using only the denominator) -# To see advanced option for Phase-Space optimization: type "update psoptim" +#********************************************************************* +# Phase-Space Optim (advanced) +#********************************************************************* + 0 = job_strategy ! see appendix of 1507.00020 (page 26) + 0 = hard_survey ! force to have better estimate of the integral at survey for difficult mode like interference + -1.0 = tmin_for_channel ! limit the non-singular reach of --some-- channel of integration related to T-channel diagram (value between -1 and 0), -1 is no impact + -1 = survey_splitting ! for loop-induced control how many core are used at survey for the computation of a single iteration. + 2 = survey_nchannel_per_job ! control how many Channel are integrated inside a single job on cluster/multicore + -1 = refine_evt_by_job ! control the maximal number of events for the first iteration of the refine (larger means less jobs) +#********************************************************************* +# Compilation flag. +#********************************************************************* + -O = global_flag ! fortran optimization flag use for the all code. + --fast-math = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' + -O3 = matrix_flag ! fortran optimization flag for matrix.f function. 
Suggestions: '-O3' + 16 = vector_size ! size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) + #********************************************************************* # Customization (custom cuts/scale/bias/...) * # list of files containing fortran function that overwrite default * @@ -185,3 +201,11 @@ # systematics = systematics_program ! none, systematics [python], SysCalc [depreceted, C++] ['--mur=0.5,1,2', '--muf=0.5,1,2', '--pdf=errorset', '--alps=0.5,1,2'] = systematics_arguments ! see: https://cp3.irmp.ucl.ac.be/projects/madgraph/wiki/Systematics#Systematicspythonmodule + +#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + d = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + auto = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + CPP = cudacpp_backend ! 
CUDACPP backend: FORTRAN, CPP, CUDA + diff --git a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts index 6409e99a49..e4b87ee6ad 100644 --- a/epochX/cudacpp/pp_tt012j.mad/Source/make_opts +++ b/epochX/cudacpp/pp_tt012j.mad/Source/make_opts @@ -1,7 +1,6 @@ DEFAULT_CPP_COMPILER=g++ DEFAULT_F2PY_COMPILER=f2py3 DEFAULT_F_COMPILER=gfortran -#end_of_make_opts_variables GLOBAL_FLAG=-O3 -ffast-math -fbounds-check MACFLAG= MG5AMC_VERSION=SpecifiedByMG5aMCAtRunTime diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py index 3995ce8109..bd1517985f 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/banner.py @@ -2822,13 +2822,15 @@ def add_param(self, name, value, fortran_name=None, include=True, if fct_mod: self.fct_mod[name] = fct_mod - def read(self, finput, consistency=True, unknown_warning=True): + def read(self, finput, consistency=True, unknown_warning=True, **opt): """Read the input file, this can be a path to a file, a file object, a str with the content of the file.""" if isinstance(finput, str): if "\n" in finput: finput = finput.split('\n') + if 'path' in opt: + self.path = opt['path'] elif os.path.isfile(finput): self.path = finput finput = open(finput) @@ -3913,7 +3915,7 @@ def remove_all_cut(self): %(global_flag)s = global_flag ! fortran optimization flag use for the all code. %(aloha_flag)s = aloha_flag ! fortran optimization flag for aloha function. Suggestions: '-ffast-math' %(matrix_flag)s = matrix_flag ! fortran optimization flag for matrix.f function. Suggestions: '-O3' - %(vector_size)s = vector_size ! size designed for SIMD/OpenMP/GPU (number of events in lockstep) + %(vector_size)s = vector_size ! 
size of fortran arrays allocated in the multi-event API for SIMD/GPU (VECSIZE_MEMMAX) """ template_off = '# To see advanced option for Phase-Space optimization: type "update psoptim"' diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 0b849330ef..3b09713e12 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -16,22 +16,28 @@ import internal.misc as misc import internal.extended_cmd as extended_cmd import internal.banner as banner_mod + import internal.common_run_interface as common_run_interface else: import madgraph.interface.madevent_interface as madevent_interface import madgraph.various.misc as misc import madgraph.interface.extended_cmd as extended_cmd import madgraph.various.banner as banner_mod + import madgraph.interface.common_run_interface as common_run_interface class CPPMEInterface(madevent_interface.MadEventCmdShell): def compile(self, *args, **opts): """ """ import multiprocessing if not self.options['nb_core'] or self.options['nb_core'] == 'None': - self.options['nb_core'] = multiprocessing.cpu_count() - if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): - import pathlib - import os - pjoin = os.path.join + self.options['nb_core'] = multiprocessing.cpu_count() + if 'cwd' in opts and os.path.basename(opts['cwd']) == 'Source': + path = pjoin(opts['cwd'], 'make_opts') + avx_level = self.run_card['avx_level'] if self.run_card['avx_level'] != 'auto' else '' + common_run_interface.CommonRunCmd.update_make_opts_full(path, + {'FPTYPE': self.run_card['floating_type'], + 'AVX': avx_level }) + misc.sprint('FPTYPE checked') + if args and args[0][0] == 'madevent' and hasattr(self, 'run_card'): cudacpp_backend = self.run_card['cudacpp_backend'].upper() # the default value is defined in banner.py logger.info("Building madevent in madevent_interface.py with '%s' matrix 
elements"%cudacpp_backend) if cudacpp_backend == 'FORTRAN': @@ -46,30 +52,83 @@ def compile(self, *args, **opts): else: return misc.compile(nb_core=self.options['nb_core'], *args, **opts) +# Phase-Space Optimization ------------------------------------------------------------------------------------ +template_on = \ +"""#*********************************************************************** +# SIMD/GPU configuration for the CUDACPP plugin +#************************************************************************ + %(floating_type)s = floating_type ! floating point precision: f (single), d (double), m (mixed: double for amplitudes, single for colors) + %(avx_level)s = avx_level ! SIMD vectorization level: none, sse4, avx2, 512y, 512z, auto + %(cudacpp_backend)s = cudacpp_backend ! CUDACPP backend: FORTRAN, CPP, CUDA +""" + +template_off = '' +plugin_block = banner_mod.RunBlock('simd', template_on=template_on, template_off=template_off) + class CPPRunCard(banner_mod.RunCardLO): + blocks = banner_mod.RunCardLO.blocks + [plugin_block] + def reset_simd(self, old_value, new_value, name): if not hasattr(self, 'path'): - logger.warning('WARNING! CPPRunCard instance has no attribute path') - return - ###raise Exception('INTERNAL ERROR! CPPRunCard instance has no attribute path') + raise Exception('INTERNAL ERROR! 
CPPRunCard instance has no attribute path') # now ok after fixing #790 if name == "vector_size" and new_value <= int(old_value): # code can handle the new size -> do not recompile return Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def reset_makeopts(self, old_value, new_value, name): + if not hasattr(self, 'path'): + raise Exception + avx_value = self['avx_level'] if self['avx_level'] != 'auto' else '' + if name == 'floating_type': + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': new_value, 'AVX': avx_value}) + elif name == 'avx_level': + if new_value == 'auto': new_value = '' + common_run_interface.CommonRunCmd.update_make_opts_full({'FPTYPE': self['floating_type'], 'AVX': new_value}) + else: + raise Exception + Sourcedir = pjoin(os.path.dirname(os.path.dirname(self.path)), 'Source') + subprocess.call(['make', 'cleanavx'], cwd=Sourcedir, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + def plugin_input(self, finput): return def default_setup(self): super().default_setup() - self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + self.add_param('floating_type', 'd', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['m','d','f']) + self.add_param('avx_level', 'auto', include=False, hidden=False, + fct_mod=(self.reset_makeopts,(),{}), + allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) + self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, + allowed=['Fortan', 'CPP', 'CUDA']) + self['vector_size'] = 16 # already setup in default class (just change value) + self['aloha_flag'] = '--fast-math' + self['matrix_flag'] = '-O3' + self.display_block.append('simd') + self.display_block.append('psoptim') + # OM/AV - overload the default version in banner.py def write_one_include_file(self, output_dir, incname, output_file=None): """write one include 
file at the time""" - if incname == "vector.inc" and 'vector_size' not in self.user_set: - return - super().write_one_include_file(output_dir, incname, output_file) + if incname == "vector.inc": + if 'vector_size' not in self.user_set: return + if output_file is None: vectorinc=pjoin(output_dir,incname) + else: vectorinc=output_file + with open(vectorinc+'.new','w') as fileout: + with open(vectorinc) as filein: + for line in filein: + if line.startswith('C'): fileout.write(line) + super().write_one_include_file(output_dir, incname, output_file) + with open(vectorinc+'.new','a') as fileout: + with open(vectorinc) as filein: + for line in filein: + if not line.startswith('\n'): fileout.write(line) + os.replace(vectorinc+'.new',vectorinc) + else: + super().write_one_include_file(output_dir, incname, output_file) def check_validity(self): """ensure that PLUGIN information are consistent""" @@ -82,13 +141,10 @@ def check_validity(self): class GPURunCard(CPPRunCard): def default_setup(self): - super(CPPRunCard, self).default_setup() - self.add_param('cudacpp_backend', 'CUDA', include=False, hidden=False) - -#class CUDACPPRunCard(CPPRunCard): -# def default_setup(self): -# super(CPPRunCard, self).default_setup() -# self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False) + super().default_setup() + # change default value: + self['cudacpp_backend'] = 'CUDA' + self['vector_size'] = 16384 # already setup in default class (just change value) MEINTERFACE = CPPMEInterface RunCard = CPPRunCard diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py index 853aabc98a..cb6bf4ca57 100755 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/madevent_interface.py @@ -3191,7 +3191,7 @@ def do_treatcards(self, line, mode=None, opt=None): if mode in ['run', 'all']: if not hasattr(self, 'run_card'): - run_card = 
banner_mod.RunCard(opt['run_card']) + run_card = banner_mod.RunCard(opt['run_card'], path=pjoin(self.me_dir, 'Cards', 'run_card.dat')) else: run_card = self.run_card self.run_card = run_card diff --git a/epochX/cudacpp/pp_tt012j.mad/mg5.in b/epochX/cudacpp/pp_tt012j.mad/mg5.in index 91e22f5295..6bc40e7968 100644 --- a/epochX/cudacpp/pp_tt012j.mad/mg5.in +++ b/epochX/cudacpp/pp_tt012j.mad/mg5.in @@ -4,4 +4,4 @@ define j = p generate p p > t t~ @0 add process p p > t t~ j @1 add process p p > t t~ j j @2 -output madevent pp_tt012j.mad --hel_recycling=False --vector_size=16384 --me_exporter=standalone_cudacpp +output madevent pp_tt012j.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp